Showing error 899

User: Jiri Slaby
Error type: Double Lock
Error type description: A lock is unintentionally acquired a second time in sequence, without an intervening release (see the sketch below this header)
File location: kernel/futex.c
Line in file: 908
Project: Linux Kernel
Project version: 2.6.28
Tools: Undetermined 1
Entered: 2012-02-27 21:22:42 UTC
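
Double-lock illustration (editorial sketch, not taken from kernel/futex.c; the helper names below are invented): the pattern this error class describes is acquiring the same non-recursive spinlock twice in sequence, which deadlocks the acquiring CPU or trips lockdep. The reported line 908 calls double_lock_hb() (file lines 705-716 in the listing below), which guards against that pattern by locking in pointer order and taking only one lock when both bucket pointers alias.

#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Illustrative helpers only -- not part of kernel/futex.c. */
static void buggy_lock_pair(spinlock_t *l1, spinlock_t *l2)
{
        spin_lock(l1);
        /*
         * If l1 and l2 may point at the same spinlock, this second
         * acquisition is the "double lock": the CPU spins forever on
         * a lock it already holds.
         */
        spin_lock(l2);

        /* ... critical section covering both objects ... */

        spin_unlock(l2);
        spin_unlock(l1);
}

/*
 * The usual cure, mirroring double_lock_hb(): impose a fixed
 * (address) order and take a single lock when the pointers alias.
 */
static void ordered_lock_pair(spinlock_t *l1, spinlock_t *l2)
{
        if (l1 < l2) {
                spin_lock(l1);
                spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
        } else if (l1 > l2) {
                spin_lock(l2);
                spin_lock_nested(l1, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(l1);  /* same object: take one lock only */
        }
}

In futex_requeue() the two hash buckets alias whenever key1 and key2 hash to the same bucket, which is the case a double-lock checker has to reason about at the flagged call.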


Source:

   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  23 *  enough at me, Linus for the original (flawed) idea, Matthew
  24 *  Kirkwood for proof-of-concept implementation.
  25 *
  26 *  "The futexes are also cursed."
  27 *  "But they come in a choice of three flavours!"
  28 *
  29 *  This program is free software; you can redistribute it and/or modify
  30 *  it under the terms of the GNU General Public License as published by
  31 *  the Free Software Foundation; either version 2 of the License, or
  32 *  (at your option) any later version.
  33 *
  34 *  This program is distributed in the hope that it will be useful,
  35 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  36 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  37 *  GNU General Public License for more details.
  38 *
  39 *  You should have received a copy of the GNU General Public License
  40 *  along with this program; if not, write to the Free Software
  41 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  42 */
  43#include <linux/slab.h>
  44#include <linux/poll.h>
  45#include <linux/fs.h>
  46#include <linux/file.h>
  47#include <linux/jhash.h>
  48#include <linux/init.h>
  49#include <linux/futex.h>
  50#include <linux/mount.h>
  51#include <linux/pagemap.h>
  52#include <linux/syscalls.h>
  53#include <linux/signal.h>
  54#include <linux/module.h>
  55#include <linux/magic.h>
  56#include <linux/pid.h>
  57#include <linux/nsproxy.h>
  58
  59#include <asm/futex.h>
  60
  61#include "rtmutex_common.h"
  62
  63int __read_mostly futex_cmpxchg_enabled;
  64
  65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
  66
  67/*
  68 * Priority Inheritance state:
  69 */
  70struct futex_pi_state {
  71        /*
  72         * list of 'owned' pi_state instances - these have to be
  73         * cleaned up in do_exit() if the task exits prematurely:
  74         */
  75        struct list_head list;
  76
  77        /*
  78         * The PI object:
  79         */
  80        struct rt_mutex pi_mutex;
  81
  82        struct task_struct *owner;
  83        atomic_t refcount;
  84
  85        union futex_key key;
  86};
  87
  88/*
  89 * We use this hashed waitqueue instead of a normal wait_queue_t, so
  90 * we can wake only the relevant ones (hashed queues may be shared).
  91 *
  92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
  93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
   94 * The order of wakeup is always to make the first condition true, then
  95 * wake up q->waiters, then make the second condition true.
  96 */
  97struct futex_q {
  98        struct plist_node list;
  99        wait_queue_head_t waiters;
 100
 101        /* Which hash list lock to use: */
 102        spinlock_t *lock_ptr;
 103
 104        /* Key which the futex is hashed on: */
 105        union futex_key key;
 106
 107        /* Optional priority inheritance state: */
 108        struct futex_pi_state *pi_state;
 109        struct task_struct *task;
 110
 111        /* Bitset for the optional bitmasked wakeup */
 112        u32 bitset;
 113};
 114
 115/*
 116 * Split the global futex_lock into every hash list lock.
 117 */
 118struct futex_hash_bucket {
 119        spinlock_t lock;
 120        struct plist_head chain;
 121};
 122
 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 124
 125/*
 126 * Take mm->mmap_sem, when futex is shared
 127 */
 128static inline void futex_lock_mm(struct rw_semaphore *fshared)
 129{
 130        if (fshared)
 131                down_read(fshared);
 132}
 133
 134/*
 135 * Release mm->mmap_sem, when the futex is shared
 136 */
 137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
 138{
 139        if (fshared)
 140                up_read(fshared);
 141}
 142
 143/*
 144 * We hash on the keys returned from get_futex_key (see below).
 145 */
 146static struct futex_hash_bucket *hash_futex(union futex_key *key)
 147{
 148        u32 hash = jhash2((u32*)&key->both.word,
 149                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 150                          key->both.offset);
 151        return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 152}
 153
 154/*
 155 * Return 1 if two futex_keys are equal, 0 otherwise.
 156 */
 157static inline int match_futex(union futex_key *key1, union futex_key *key2)
 158{
 159        return (key1->both.word == key2->both.word
 160                && key1->both.ptr == key2->both.ptr
 161                && key1->both.offset == key2->both.offset);
 162}
 163
 164/**
 165 * get_futex_key - Get parameters which are the keys for a futex.
 166 * @uaddr: virtual address of the futex
 167 * @shared: NULL for a PROCESS_PRIVATE futex,
 168 *        &current->mm->mmap_sem for a PROCESS_SHARED futex
 169 * @key: address where result is stored.
 170 *
 171 * Returns a negative error code or 0
 172 * The key words are stored in *key on success.
 173 *
 174 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 175 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 176 * We can usually work out the index without swapping in the page.
 177 *
 178 * fshared is NULL for PROCESS_PRIVATE futexes
 179 * For other futexes, it points to &current->mm->mmap_sem and
 180 * caller must have taken the reader lock. but NOT any spinlocks.
 181 */
 182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 183                         union futex_key *key)
 184{
 185        unsigned long address = (unsigned long)uaddr;
 186        struct mm_struct *mm = current->mm;
 187        struct vm_area_struct *vma;
 188        struct page *page;
 189        int err;
 190
 191        /*
 192         * The futex address must be "naturally" aligned.
 193         */
 194        key->both.offset = address % PAGE_SIZE;
 195        if (unlikely((address % sizeof(u32)) != 0))
 196                return -EINVAL;
 197        address -= key->both.offset;
 198
 199        /*
 200         * PROCESS_PRIVATE futexes are fast.
 201         * As the mm cannot disappear under us and the 'key' only needs
 202         * virtual address, we dont even have to find the underlying vma.
 203         * Note : We do have to check 'uaddr' is a valid user address,
 204         *        but access_ok() should be faster than find_vma()
 205         */
 206        if (!fshared) {
 207                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
 208                        return -EFAULT;
 209                key->private.mm = mm;
 210                key->private.address = address;
 211                return 0;
 212        }
 213        /*
 214         * The futex is hashed differently depending on whether
 215         * it's in a shared or private mapping.  So check vma first.
 216         */
 217        vma = find_extend_vma(mm, address);
 218        if (unlikely(!vma))
 219                return -EFAULT;
 220
 221        /*
 222         * Permissions.
 223         */
 224        if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 225                return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 226
 227        /*
 228         * Private mappings are handled in a simple way.
 229         *
 230         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 231         * it's a read-only handle, it's expected that futexes attach to
 232         * the object not the particular process.  Therefore we use
 233         * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
 234         * mappings of _writable_ handles.
 235         */
 236        if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 237                key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
 238                key->private.mm = mm;
 239                key->private.address = address;
 240                return 0;
 241        }
 242
 243        /*
 244         * Linear file mappings are also simple.
 245         */
 246        key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
 247        key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
 248        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 249                key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 250                                     + vma->vm_pgoff);
 251                return 0;
 252        }
 253
 254        /*
 255         * We could walk the page table to read the non-linear
 256         * pte, and get the page index without fetching the page
 257         * from swap.  But that's a lot of code to duplicate here
 258         * for a rare case, so we simply fetch the page.
 259         */
 260        err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
 261        if (err >= 0) {
 262                key->shared.pgoff =
 263                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 264                put_page(page);
 265                return 0;
 266        }
 267        return err;
 268}
 269
 270/*
 271 * Take a reference to the resource addressed by a key.
 272 * Can be called while holding spinlocks.
 273 *
 274 */
 275static void get_futex_key_refs(union futex_key *key)
 276{
 277        if (key->both.ptr == NULL)
 278                return;
 279        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 280                case FUT_OFF_INODE:
 281                        atomic_inc(&key->shared.inode->i_count);
 282                        break;
 283                case FUT_OFF_MMSHARED:
 284                        atomic_inc(&key->private.mm->mm_count);
 285                        break;
 286        }
 287}
 288
 289/*
 290 * Drop a reference to the resource addressed by a key.
 291 * The hash bucket spinlock must not be held.
 292 */
 293static void drop_futex_key_refs(union futex_key *key)
 294{
 295        if (!key->both.ptr)
 296                return;
 297        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 298                case FUT_OFF_INODE:
 299                        iput(key->shared.inode);
 300                        break;
 301                case FUT_OFF_MMSHARED:
 302                        mmdrop(key->private.mm);
 303                        break;
 304        }
 305}
 306
 307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 308{
 309        u32 curval;
 310
 311        pagefault_disable();
 312        curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 313        pagefault_enable();
 314
 315        return curval;
 316}
 317
 318static int get_futex_value_locked(u32 *dest, u32 __user *from)
 319{
 320        int ret;
 321
 322        pagefault_disable();
 323        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 324        pagefault_enable();
 325
 326        return ret ? -EFAULT : 0;
 327}
 328
 329/*
 330 * Fault handling.
 331 * if fshared is non NULL, current->mm->mmap_sem is already held
 332 */
 333static int futex_handle_fault(unsigned long address,
 334                              struct rw_semaphore *fshared, int attempt)
 335{
 336        struct vm_area_struct * vma;
 337        struct mm_struct *mm = current->mm;
 338        int ret = -EFAULT;
 339
 340        if (attempt > 2)
 341                return ret;
 342
 343        if (!fshared)
 344                down_read(&mm->mmap_sem);
 345        vma = find_vma(mm, address);
 346        if (vma && address >= vma->vm_start &&
 347            (vma->vm_flags & VM_WRITE)) {
 348                int fault;
 349                fault = handle_mm_fault(mm, vma, address, 1);
 350                if (unlikely((fault & VM_FAULT_ERROR))) {
 351#if 0
 352                        /* XXX: let's do this when we verify it is OK */
 353                        if (ret & VM_FAULT_OOM)
 354                                ret = -ENOMEM;
 355#endif
 356                } else {
 357                        ret = 0;
 358                        if (fault & VM_FAULT_MAJOR)
 359                                current->maj_flt++;
 360                        else
 361                                current->min_flt++;
 362                }
 363        }
 364        if (!fshared)
 365                up_read(&mm->mmap_sem);
 366        return ret;
 367}
 368
 369/*
 370 * PI code:
 371 */
 372static int refill_pi_state_cache(void)
 373{
 374        struct futex_pi_state *pi_state;
 375
 376        if (likely(current->pi_state_cache))
 377                return 0;
 378
 379        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 380
 381        if (!pi_state)
 382                return -ENOMEM;
 383
 384        INIT_LIST_HEAD(&pi_state->list);
 385        /* pi_mutex gets initialized later */
 386        pi_state->owner = NULL;
 387        atomic_set(&pi_state->refcount, 1);
 388
 389        current->pi_state_cache = pi_state;
 390
 391        return 0;
 392}
 393
 394static struct futex_pi_state * alloc_pi_state(void)
 395{
 396        struct futex_pi_state *pi_state = current->pi_state_cache;
 397
 398        WARN_ON(!pi_state);
 399        current->pi_state_cache = NULL;
 400
 401        return pi_state;
 402}
 403
 404static void free_pi_state(struct futex_pi_state *pi_state)
 405{
 406        if (!atomic_dec_and_test(&pi_state->refcount))
 407                return;
 408
 409        /*
 410         * If pi_state->owner is NULL, the owner is most probably dying
 411         * and has cleaned up the pi_state already
 412         */
 413        if (pi_state->owner) {
 414                spin_lock_irq(&pi_state->owner->pi_lock);
 415                list_del_init(&pi_state->list);
 416                spin_unlock_irq(&pi_state->owner->pi_lock);
 417
 418                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 419        }
 420
 421        if (current->pi_state_cache)
 422                kfree(pi_state);
 423        else {
 424                /*
 425                 * pi_state->list is already empty.
 426                 * clear pi_state->owner.
 427                 * refcount is at 0 - put it back to 1.
 428                 */
 429                pi_state->owner = NULL;
 430                atomic_set(&pi_state->refcount, 1);
 431                current->pi_state_cache = pi_state;
 432        }
 433}
 434
 435/*
 436 * Look up the task based on what TID userspace gave us.
 437 * We dont trust it.
 438 */
 439static struct task_struct * futex_find_get_task(pid_t pid)
 440{
 441        struct task_struct *p;
 442
 443        rcu_read_lock();
 444        p = find_task_by_vpid(pid);
 445        if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
 446                p = ERR_PTR(-ESRCH);
 447        else
 448                get_task_struct(p);
 449
 450        rcu_read_unlock();
 451
 452        return p;
 453}
 454
 455/*
 456 * This task is holding PI mutexes at exit time => bad.
 457 * Kernel cleans up PI-state, but userspace is likely hosed.
 458 * (Robust-futex cleanup is separate and might save the day for userspace.)
 459 */
 460void exit_pi_state_list(struct task_struct *curr)
 461{
 462        struct list_head *next, *head = &curr->pi_state_list;
 463        struct futex_pi_state *pi_state;
 464        struct futex_hash_bucket *hb;
 465        union futex_key key;
 466
 467        if (!futex_cmpxchg_enabled)
 468                return;
 469        /*
 470         * We are a ZOMBIE and nobody can enqueue itself on
 471         * pi_state_list anymore, but we have to be careful
 472         * versus waiters unqueueing themselves:
 473         */
 474        spin_lock_irq(&curr->pi_lock);
 475        while (!list_empty(head)) {
 476
 477                next = head->next;
 478                pi_state = list_entry(next, struct futex_pi_state, list);
 479                key = pi_state->key;
 480                hb = hash_futex(&key);
 481                spin_unlock_irq(&curr->pi_lock);
 482
 483                spin_lock(&hb->lock);
 484
 485                spin_lock_irq(&curr->pi_lock);
 486                /*
 487                 * We dropped the pi-lock, so re-check whether this
 488                 * task still owns the PI-state:
 489                 */
 490                if (head->next != next) {
 491                        spin_unlock(&hb->lock);
 492                        continue;
 493                }
 494
 495                WARN_ON(pi_state->owner != curr);
 496                WARN_ON(list_empty(&pi_state->list));
 497                list_del_init(&pi_state->list);
 498                pi_state->owner = NULL;
 499                spin_unlock_irq(&curr->pi_lock);
 500
 501                rt_mutex_unlock(&pi_state->pi_mutex);
 502
 503                spin_unlock(&hb->lock);
 504
 505                spin_lock_irq(&curr->pi_lock);
 506        }
 507        spin_unlock_irq(&curr->pi_lock);
 508}
 509
 510static int
 511lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 512                union futex_key *key, struct futex_pi_state **ps)
 513{
 514        struct futex_pi_state *pi_state = NULL;
 515        struct futex_q *this, *next;
 516        struct plist_head *head;
 517        struct task_struct *p;
 518        pid_t pid = uval & FUTEX_TID_MASK;
 519
 520        head = &hb->chain;
 521
 522        plist_for_each_entry_safe(this, next, head, list) {
 523                if (match_futex(&this->key, key)) {
 524                        /*
 525                         * Another waiter already exists - bump up
 526                         * the refcount and return its pi_state:
 527                         */
 528                        pi_state = this->pi_state;
 529                        /*
 530                         * Userspace might have messed up non PI and PI futexes
 531                         */
 532                        if (unlikely(!pi_state))
 533                                return -EINVAL;
 534
 535                        WARN_ON(!atomic_read(&pi_state->refcount));
 536                        WARN_ON(pid && pi_state->owner &&
 537                                pi_state->owner->pid != pid);
 538
 539                        atomic_inc(&pi_state->refcount);
 540                        *ps = pi_state;
 541
 542                        return 0;
 543                }
 544        }
 545
 546        /*
 547         * We are the first waiter - try to look up the real owner and attach
 548         * the new pi_state to it, but bail out when TID = 0
 549         */
 550        if (!pid)
 551                return -ESRCH;
 552        p = futex_find_get_task(pid);
 553        if (IS_ERR(p))
 554                return PTR_ERR(p);
 555
 556        /*
 557         * We need to look at the task state flags to figure out,
 558         * whether the task is exiting. To protect against the do_exit
 559         * change of the task flags, we do this protected by
 560         * p->pi_lock:
 561         */
 562        spin_lock_irq(&p->pi_lock);
 563        if (unlikely(p->flags & PF_EXITING)) {
 564                /*
 565                 * The task is on the way out. When PF_EXITPIDONE is
 566                 * set, we know that the task has finished the
 567                 * cleanup:
 568                 */
 569                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 570
 571                spin_unlock_irq(&p->pi_lock);
 572                put_task_struct(p);
 573                return ret;
 574        }
 575
 576        pi_state = alloc_pi_state();
 577
 578        /*
 579         * Initialize the pi_mutex in locked state and make 'p'
 580         * the owner of it:
 581         */
 582        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 583
 584        /* Store the key for possible exit cleanups: */
 585        pi_state->key = *key;
 586
 587        WARN_ON(!list_empty(&pi_state->list));
 588        list_add(&pi_state->list, &p->pi_state_list);
 589        pi_state->owner = p;
 590        spin_unlock_irq(&p->pi_lock);
 591
 592        put_task_struct(p);
 593
 594        *ps = pi_state;
 595
 596        return 0;
 597}
 598
 599/*
 600 * The hash bucket lock must be held when this is called.
 601 * Afterwards, the futex_q must not be accessed.
 602 */
 603static void wake_futex(struct futex_q *q)
 604{
 605        plist_del(&q->list, &q->list.plist);
 606        /*
 607         * The lock in wake_up_all() is a crucial memory barrier after the
 608         * plist_del() and also before assigning to q->lock_ptr.
 609         */
 610        wake_up_all(&q->waiters);
 611        /*
 612         * The waiting task can free the futex_q as soon as this is written,
 613         * without taking any locks.  This must come last.
 614         *
 615         * A memory barrier is required here to prevent the following store
 616         * to lock_ptr from getting ahead of the wakeup. Clearing the lock
 617         * at the end of wake_up_all() does not prevent this store from
 618         * moving.
 619         */
 620        smp_wmb();
 621        q->lock_ptr = NULL;
 622}
 623
 624static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 625{
 626        struct task_struct *new_owner;
 627        struct futex_pi_state *pi_state = this->pi_state;
 628        u32 curval, newval;
 629
 630        if (!pi_state)
 631                return -EINVAL;
 632
 633        spin_lock(&pi_state->pi_mutex.wait_lock);
 634        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 635
 636        /*
 637         * This happens when we have stolen the lock and the original
 638         * pending owner did not enqueue itself back on the rt_mutex.
  639 *  That's not a tragedy: that way, we know that a lock waiter
 640         * is on the fly. We make the futex_q waiter the pending owner.
 641         */
 642        if (!new_owner)
 643                new_owner = this->task;
 644
 645        /*
 646         * We pass it to the next owner. (The WAITERS bit is always
 647         * kept enabled while there is PI state around. We must also
 648         * preserve the owner died bit.)
 649         */
 650        if (!(uval & FUTEX_OWNER_DIED)) {
 651                int ret = 0;
 652
 653                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 654
 655                curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
 656
 657                if (curval == -EFAULT)
 658                        ret = -EFAULT;
 659                else if (curval != uval)
 660                        ret = -EINVAL;
 661                if (ret) {
 662                        spin_unlock(&pi_state->pi_mutex.wait_lock);
 663                        return ret;
 664                }
 665        }
 666
 667        spin_lock_irq(&pi_state->owner->pi_lock);
 668        WARN_ON(list_empty(&pi_state->list));
 669        list_del_init(&pi_state->list);
 670        spin_unlock_irq(&pi_state->owner->pi_lock);
 671
 672        spin_lock_irq(&new_owner->pi_lock);
 673        WARN_ON(!list_empty(&pi_state->list));
 674        list_add(&pi_state->list, &new_owner->pi_state_list);
 675        pi_state->owner = new_owner;
 676        spin_unlock_irq(&new_owner->pi_lock);
 677
 678        spin_unlock(&pi_state->pi_mutex.wait_lock);
 679        rt_mutex_unlock(&pi_state->pi_mutex);
 680
 681        return 0;
 682}
 683
 684static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 685{
 686        u32 oldval;
 687
 688        /*
 689         * There is no waiter, so we unlock the futex. The owner died
  690 * bit does not need to be preserved here. We are the owner:
 691         */
 692        oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
 693
 694        if (oldval == -EFAULT)
 695                return oldval;
 696        if (oldval != uval)
 697                return -EAGAIN;
 698
 699        return 0;
 700}
 701
 702/*
 703 * Express the locking dependencies for lockdep:
 704 */
 705static inline void
 706double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 707{
 708        if (hb1 <= hb2) {
 709                spin_lock(&hb1->lock);
 710                if (hb1 < hb2)
 711                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 712        } else { /* hb1 > hb2 */
 713                spin_lock(&hb2->lock);
 714                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 715        }
 716}
 717
 718/*
 719 * Wake up all waiters hashed on the physical page that is mapped
 720 * to this virtual address:
 721 */
 722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 723                      int nr_wake, u32 bitset)
 724{
 725        struct futex_hash_bucket *hb;
 726        struct futex_q *this, *next;
 727        struct plist_head *head;
 728        union futex_key key;
 729        int ret;
 730
 731        if (!bitset)
 732                return -EINVAL;
 733
 734        futex_lock_mm(fshared);
 735
 736        ret = get_futex_key(uaddr, fshared, &key);
 737        if (unlikely(ret != 0))
 738                goto out;
 739
 740        hb = hash_futex(&key);
 741        spin_lock(&hb->lock);
 742        head = &hb->chain;
 743
 744        plist_for_each_entry_safe(this, next, head, list) {
 745                if (match_futex (&this->key, &key)) {
 746                        if (this->pi_state) {
 747                                ret = -EINVAL;
 748                                break;
 749                        }
 750
 751                        /* Check if one of the bits is set in both bitsets */
 752                        if (!(this->bitset & bitset))
 753                                continue;
 754
 755                        wake_futex(this);
 756                        if (++ret >= nr_wake)
 757                                break;
 758                }
 759        }
 760
 761        spin_unlock(&hb->lock);
 762out:
 763        futex_unlock_mm(fshared);
 764        return ret;
 765}
 766
 767/*
 768 * Wake up all waiters hashed on the physical page that is mapped
 769 * to this virtual address:
 770 */
 771static int
 772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
 773              u32 __user *uaddr2,
 774              int nr_wake, int nr_wake2, int op)
 775{
 776        union futex_key key1, key2;
 777        struct futex_hash_bucket *hb1, *hb2;
 778        struct plist_head *head;
 779        struct futex_q *this, *next;
 780        int ret, op_ret, attempt = 0;
 781
 782retryfull:
 783        futex_lock_mm(fshared);
 784
 785        ret = get_futex_key(uaddr1, fshared, &key1);
 786        if (unlikely(ret != 0))
 787                goto out;
 788        ret = get_futex_key(uaddr2, fshared, &key2);
 789        if (unlikely(ret != 0))
 790                goto out;
 791
 792        hb1 = hash_futex(&key1);
 793        hb2 = hash_futex(&key2);
 794
 795retry:
 796        double_lock_hb(hb1, hb2);
 797
 798        op_ret = futex_atomic_op_inuser(op, uaddr2);
 799        if (unlikely(op_ret < 0)) {
 800                u32 dummy;
 801
 802                spin_unlock(&hb1->lock);
 803                if (hb1 != hb2)
 804                        spin_unlock(&hb2->lock);
 805
 806#ifndef CONFIG_MMU
 807                /*
 808                 * we don't get EFAULT from MMU faults if we don't have an MMU,
 809                 * but we might get them from range checking
 810                 */
 811                ret = op_ret;
 812                goto out;
 813#endif
 814
 815                if (unlikely(op_ret != -EFAULT)) {
 816                        ret = op_ret;
 817                        goto out;
 818                }
 819
 820                /*
 821                 * futex_atomic_op_inuser needs to both read and write
 822                 * *(int __user *)uaddr2, but we can't modify it
 823                 * non-atomically.  Therefore, if get_user below is not
 824                 * enough, we need to handle the fault ourselves, while
 825                 * still holding the mmap_sem.
 826                 */
 827                if (attempt++) {
 828                        ret = futex_handle_fault((unsigned long)uaddr2,
 829                                                 fshared, attempt);
 830                        if (ret)
 831                                goto out;
 832                        goto retry;
 833                }
 834
 835                /*
 836                 * If we would have faulted, release mmap_sem,
 837                 * fault it in and start all over again.
 838                 */
 839                futex_unlock_mm(fshared);
 840
 841                ret = get_user(dummy, uaddr2);
 842                if (ret)
 843                        return ret;
 844
 845                goto retryfull;
 846        }
 847
 848        head = &hb1->chain;
 849
 850        plist_for_each_entry_safe(this, next, head, list) {
 851                if (match_futex (&this->key, &key1)) {
 852                        wake_futex(this);
 853                        if (++ret >= nr_wake)
 854                                break;
 855                }
 856        }
 857
 858        if (op_ret > 0) {
 859                head = &hb2->chain;
 860
 861                op_ret = 0;
 862                plist_for_each_entry_safe(this, next, head, list) {
 863                        if (match_futex (&this->key, &key2)) {
 864                                wake_futex(this);
 865                                if (++op_ret >= nr_wake2)
 866                                        break;
 867                        }
 868                }
 869                ret += op_ret;
 870        }
 871
 872        spin_unlock(&hb1->lock);
 873        if (hb1 != hb2)
 874                spin_unlock(&hb2->lock);
 875out:
 876        futex_unlock_mm(fshared);
 877
 878        return ret;
 879}
 880
 881/*
 882 * Requeue all waiters hashed on one physical page to another
 883 * physical page.
 884 */
 885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 886                         u32 __user *uaddr2,
 887                         int nr_wake, int nr_requeue, u32 *cmpval)
 888{
 889        union futex_key key1, key2;
 890        struct futex_hash_bucket *hb1, *hb2;
 891        struct plist_head *head1;
 892        struct futex_q *this, *next;
 893        int ret, drop_count = 0;
 894
 895 retry:
 896        futex_lock_mm(fshared);
 897
 898        ret = get_futex_key(uaddr1, fshared, &key1);
 899        if (unlikely(ret != 0))
 900                goto out;
 901        ret = get_futex_key(uaddr2, fshared, &key2);
 902        if (unlikely(ret != 0))
 903                goto out;
 904
 905        hb1 = hash_futex(&key1);
 906        hb2 = hash_futex(&key2);
 907
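        /*
         * Annotation, not part of the original source: the call on the next
         * line is the location reported above ("Line in file: 908").
         * double_lock_hb() (file lines 705-716) acquires hb2->lock only when
         * hb1 != hb2, so the aliased-bucket case takes a single lock.
         */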
 908        double_lock_hb(hb1, hb2);
 909
 910        if (likely(cmpval != NULL)) {
 911                u32 curval;
 912
 913                ret = get_futex_value_locked(&curval, uaddr1);
 914
 915                if (unlikely(ret)) {
 916                        spin_unlock(&hb1->lock);
 917                        if (hb1 != hb2)
 918                                spin_unlock(&hb2->lock);
 919
 920                        /*
 921                         * If we would have faulted, release mmap_sem, fault
 922                         * it in and start all over again.
 923                         */
 924                        futex_unlock_mm(fshared);
 925
 926                        ret = get_user(curval, uaddr1);
 927
 928                        if (!ret)
 929                                goto retry;
 930
 931                        return ret;
 932                }
 933                if (curval != *cmpval) {
 934                        ret = -EAGAIN;
 935                        goto out_unlock;
 936                }
 937        }
 938
 939        head1 = &hb1->chain;
 940        plist_for_each_entry_safe(this, next, head1, list) {
 941                if (!match_futex (&this->key, &key1))
 942                        continue;
 943                if (++ret <= nr_wake) {
 944                        wake_futex(this);
 945                } else {
 946                        /*
 947                         * If key1 and key2 hash to the same bucket, no need to
 948                         * requeue.
 949                         */
 950                        if (likely(head1 != &hb2->chain)) {
 951                                plist_del(&this->list, &hb1->chain);
 952                                plist_add(&this->list, &hb2->chain);
 953                                this->lock_ptr = &hb2->lock;
 954#ifdef CONFIG_DEBUG_PI_LIST
 955                                this->list.plist.lock = &hb2->lock;
 956#endif
 957                        }
 958                        this->key = key2;
 959                        get_futex_key_refs(&key2);
 960                        drop_count++;
 961
 962                        if (ret - nr_wake >= nr_requeue)
 963                                break;
 964                }
 965        }
 966
 967out_unlock:
 968        spin_unlock(&hb1->lock);
 969        if (hb1 != hb2)
 970                spin_unlock(&hb2->lock);
 971
 972        /* drop_futex_key_refs() must be called outside the spinlocks. */
 973        while (--drop_count >= 0)
 974                drop_futex_key_refs(&key1);
 975
 976out:
 977        futex_unlock_mm(fshared);
 978        return ret;
 979}
 980
 981/* The key must be already stored in q->key. */
 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 983{
 984        struct futex_hash_bucket *hb;
 985
 986        init_waitqueue_head(&q->waiters);
 987
 988        get_futex_key_refs(&q->key);
 989        hb = hash_futex(&q->key);
 990        q->lock_ptr = &hb->lock;
 991
 992        spin_lock(&hb->lock);
 993        return hb;
 994}
 995
 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 997{
 998        int prio;
 999
1000        /*
1001         * The priority used to register this element is
1002         * - either the real thread-priority for the real-time threads
1003         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1004         * - or MAX_RT_PRIO for non-RT threads.
1005         * Thus, all RT-threads are woken first in priority order, and
1006         * the others are woken last, in FIFO order.
1007         */
1008        prio = min(current->normal_prio, MAX_RT_PRIO);
1009
1010        plist_node_init(&q->list, prio);
1011#ifdef CONFIG_DEBUG_PI_LIST
1012        q->list.plist.lock = &hb->lock;
1013#endif
1014        plist_add(&q->list, &hb->chain);
1015        q->task = current;
1016        spin_unlock(&hb->lock);
1017}
1018
1019static inline void
1020queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1021{
1022        spin_unlock(&hb->lock);
1023        drop_futex_key_refs(&q->key);
1024}
1025
1026/*
1027 * queue_me and unqueue_me must be called as a pair, each
1028 * exactly once.  They are called with the hashed spinlock held.
1029 */
1030
1031/* Return 1 if we were still queued (ie. 0 means we were woken) */
1032static int unqueue_me(struct futex_q *q)
1033{
1034        spinlock_t *lock_ptr;
1035        int ret = 0;
1036
1037        /* In the common case we don't take the spinlock, which is nice. */
1038 retry:
1039        lock_ptr = q->lock_ptr;
1040        barrier();
1041        if (lock_ptr != NULL) {
1042                spin_lock(lock_ptr);
1043                /*
1044                 * q->lock_ptr can change between reading it and
1045                 * spin_lock(), causing us to take the wrong lock.  This
1046                 * corrects the race condition.
1047                 *
1048                 * Reasoning goes like this: if we have the wrong lock,
1049                 * q->lock_ptr must have changed (maybe several times)
1050                 * between reading it and the spin_lock().  It can
1051                 * change again after the spin_lock() but only if it was
1052                 * already changed before the spin_lock().  It cannot,
1053                 * however, change back to the original value.  Therefore
1054                 * we can detect whether we acquired the correct lock.
1055                 */
1056                if (unlikely(lock_ptr != q->lock_ptr)) {
1057                        spin_unlock(lock_ptr);
1058                        goto retry;
1059                }
1060                WARN_ON(plist_node_empty(&q->list));
1061                plist_del(&q->list, &q->list.plist);
1062
1063                BUG_ON(q->pi_state);
1064
1065                spin_unlock(lock_ptr);
1066                ret = 1;
1067        }
1068
1069        drop_futex_key_refs(&q->key);
1070        return ret;
1071}
1072
1073/*
 1074 * PI futexes cannot be requeued and must remove themselves from the
1075 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1076 * and dropped here.
1077 */
1078static void unqueue_me_pi(struct futex_q *q)
1079{
1080        WARN_ON(plist_node_empty(&q->list));
1081        plist_del(&q->list, &q->list.plist);
1082
1083        BUG_ON(!q->pi_state);
1084        free_pi_state(q->pi_state);
1085        q->pi_state = NULL;
1086
1087        spin_unlock(q->lock_ptr);
1088
1089        drop_futex_key_refs(&q->key);
1090}
1091
1092/*
1093 * Fixup the pi_state owner with the new owner.
1094 *
1095 * Must be called with hash bucket lock held and mm->sem held for non
1096 * private futexes.
1097 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099                                struct task_struct *newowner,
1100                                struct rw_semaphore *fshared)
1101{
1102        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103        struct futex_pi_state *pi_state = q->pi_state;
1104        struct task_struct *oldowner = pi_state->owner;
1105        u32 uval, curval, newval;
1106        int ret, attempt = 0;
1107
1108        /* Owner died? */
1109        if (!pi_state->owner)
1110                newtid |= FUTEX_OWNER_DIED;
1111
1112        /*
1113         * We are here either because we stole the rtmutex from the
1114         * pending owner or we are the pending owner which failed to
1115         * get the rtmutex. We have to replace the pending owner TID
1116         * in the user space variable. This must be atomic as we have
1117         * to preserve the owner died bit here.
1118         *
1119         * Note: We write the user space value _before_ changing the
1120         * pi_state because we can fault here. Imagine swapped out
1121         * pages or a fork, which was running right before we acquired
1122         * mmap_sem, that marked all the anonymous memory readonly for
1123         * cow.
1124         *
1125         * Modifying pi_state _before_ the user space value would
1126         * leave the pi_state in an inconsistent state when we fault
1127         * here, because we need to drop the hash bucket lock to
1128         * handle the fault. This might be observed in the PID check
1129         * in lookup_pi_state.
1130         */
1131retry:
1132        if (get_futex_value_locked(&uval, uaddr))
1133                goto handle_fault;
1134
1135        while (1) {
1136                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1137
1138                curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1139
1140                if (curval == -EFAULT)
1141                        goto handle_fault;
1142                if (curval == uval)
1143                        break;
1144                uval = curval;
1145        }
1146
1147        /*
1148         * We fixed up user space. Now we need to fix the pi_state
1149         * itself.
1150         */
1151        if (pi_state->owner != NULL) {
1152                spin_lock_irq(&pi_state->owner->pi_lock);
1153                WARN_ON(list_empty(&pi_state->list));
1154                list_del_init(&pi_state->list);
1155                spin_unlock_irq(&pi_state->owner->pi_lock);
1156        }
1157
1158        pi_state->owner = newowner;
1159
1160        spin_lock_irq(&newowner->pi_lock);
1161        WARN_ON(!list_empty(&pi_state->list));
1162        list_add(&pi_state->list, &newowner->pi_state_list);
1163        spin_unlock_irq(&newowner->pi_lock);
1164        return 0;
1165
1166        /*
1167         * To handle the page fault we need to drop the hash bucket
1168         * lock here. That gives the other task (either the pending
1169         * owner itself or the task which stole the rtmutex) the
1170         * chance to try the fixup of the pi_state. So once we are
1171         * back from handling the fault we need to check the pi_state
1172         * after reacquiring the hash bucket lock and before trying to
1173         * do another fixup. When the fixup has been done already we
1174         * simply return.
1175         */
1176handle_fault:
1177        spin_unlock(q->lock_ptr);
1178
1179        ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
1180
1181        spin_lock(q->lock_ptr);
1182
1183        /*
1184         * Check if someone else fixed it for us:
1185         */
1186        if (pi_state->owner != oldowner)
1187                return 0;
1188
1189        if (ret)
1190                return ret;
1191
1192        goto retry;
1193}
1194
1195/*
1196 * In case we must use restart_block to restart a futex_wait,
1197 * we encode in the 'flags' shared capability
1198 */
1199#define FLAGS_SHARED  1
1200
1201static long futex_wait_restart(struct restart_block *restart);
1202
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1204                      u32 val, ktime_t *abs_time, u32 bitset)
1205{
1206        struct task_struct *curr = current;
1207        DECLARE_WAITQUEUE(wait, curr);
1208        struct futex_hash_bucket *hb;
1209        struct futex_q q;
1210        u32 uval;
1211        int ret;
1212        struct hrtimer_sleeper t;
1213        int rem = 0;
1214
1215        if (!bitset)
1216                return -EINVAL;
1217
1218        q.pi_state = NULL;
1219        q.bitset = bitset;
1220 retry:
1221        futex_lock_mm(fshared);
1222
1223        ret = get_futex_key(uaddr, fshared, &q.key);
1224        if (unlikely(ret != 0))
1225                goto out_release_sem;
1226
1227        hb = queue_lock(&q);
1228
1229        /*
1230         * Access the page AFTER the futex is queued.
1231         * Order is important:
1232         *
1233         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1234         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1235         *
1236         * The basic logical guarantee of a futex is that it blocks ONLY
1237         * if cond(var) is known to be true at the time of blocking, for
1238         * any cond.  If we queued after testing *uaddr, that would open
1239         * a race condition where we could block indefinitely with
1240         * cond(var) false, which would violate the guarantee.
1241         *
1242         * A consequence is that futex_wait() can return zero and absorb
1243         * a wakeup when *uaddr != val on entry to the syscall.  This is
1244         * rare, but normal.
1245         *
1246         * for shared futexes, we hold the mmap semaphore, so the mapping
1247         * cannot have changed since we looked it up in get_futex_key.
1248         */
1249        ret = get_futex_value_locked(&uval, uaddr);
1250
1251        if (unlikely(ret)) {
1252                queue_unlock(&q, hb);
1253
1254                /*
1255                 * If we would have faulted, release mmap_sem, fault it in and
1256                 * start all over again.
1257                 */
1258                futex_unlock_mm(fshared);
1259
1260                ret = get_user(uval, uaddr);
1261
1262                if (!ret)
1263                        goto retry;
1264                return ret;
1265        }
1266        ret = -EWOULDBLOCK;
1267        if (uval != val)
1268                goto out_unlock_release_sem;
1269
1270        /* Only actually queue if *uaddr contained val.  */
1271        queue_me(&q, hb);
1272
1273        /*
1274         * Now the futex is queued and we have checked the data, we
1275         * don't want to hold mmap_sem while we sleep.
1276         */
1277        futex_unlock_mm(fshared);
1278
1279        /*
1280         * There might have been scheduling since the queue_me(), as we
1281         * cannot hold a spinlock across the get_user() in case it
1282         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1283         * queueing ourselves into the futex hash.  This code thus has to
1284         * rely on the futex_wake() code removing us from hash when it
1285         * wakes us up.
1286         */
1287
1288        /* add_wait_queue is the barrier after __set_current_state. */
1289        __set_current_state(TASK_INTERRUPTIBLE);
1290        add_wait_queue(&q.waiters, &wait);
1291        /*
1292         * !plist_node_empty() is safe here without any lock.
1293         * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1294         */
1295        if (likely(!plist_node_empty(&q.list))) {
1296                if (!abs_time)
1297                        schedule();
1298                else {
1299                        unsigned long slack;
1300                        slack = current->timer_slack_ns;
1301                        if (rt_task(current))
1302                                slack = 0;
1303                        hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1304                                                HRTIMER_MODE_ABS);
1305                        hrtimer_init_sleeper(&t, current);
1306                        hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1307
1308                        hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1309                        if (!hrtimer_active(&t.timer))
1310                                t.task = NULL;
1311
1312                        /*
1313                         * the timer could have already expired, in which
1314                         * case current would be flagged for rescheduling.
1315                         * Don't bother calling schedule.
1316                         */
1317                        if (likely(t.task))
1318                                schedule();
1319
1320                        hrtimer_cancel(&t.timer);
1321
 1322                        /* Flag if a timeout occurred */
1323                        rem = (t.task == NULL);
1324
1325                        destroy_hrtimer_on_stack(&t.timer);
1326                }
1327        }
1328        __set_current_state(TASK_RUNNING);
1329
1330        /*
1331         * NOTE: we don't remove ourselves from the waitqueue because
1332         * we are the only user of it.
1333         */
1334
1335        /* If we were woken (and unqueued), we succeeded, whatever. */
1336        if (!unqueue_me(&q))
1337                return 0;
1338        if (rem)
1339                return -ETIMEDOUT;
1340
1341        /*
1342         * We expect signal_pending(current), but another thread may
1343         * have handled it for us already.
1344         */
1345        if (!abs_time)
1346                return -ERESTARTSYS;
1347        else {
1348                struct restart_block *restart;
1349                restart = &current_thread_info()->restart_block;
1350                restart->fn = futex_wait_restart;
1351                restart->futex.uaddr = (u32 *)uaddr;
1352                restart->futex.val = val;
1353                restart->futex.time = abs_time->tv64;
1354                restart->futex.bitset = bitset;
1355                restart->futex.flags = 0;
1356
1357                if (fshared)
1358                        restart->futex.flags |= FLAGS_SHARED;
1359                return -ERESTART_RESTARTBLOCK;
1360        }
1361
1362 out_unlock_release_sem:
1363        queue_unlock(&q, hb);
1364
1365 out_release_sem:
1366        futex_unlock_mm(fshared);
1367        return ret;
1368}
1369
1370
1371static long futex_wait_restart(struct restart_block *restart)
1372{
1373        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374        struct rw_semaphore *fshared = NULL;
1375        ktime_t t;
1376
1377        t.tv64 = restart->futex.time;
1378        restart->fn = do_no_restart_syscall;
1379        if (restart->futex.flags & FLAGS_SHARED)
1380                fshared = &current->mm->mmap_sem;
1381        return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382                                restart->futex.bitset);
1383}
1384
1385
1386/*
1387 * Userspace tried a 0 -> TID atomic transition of the futex value
1388 * and failed. The kernel side here does the whole locking operation:
1389 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.)
1391 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1393                         int detect, ktime_t *time, int trylock)
1394{
1395        struct hrtimer_sleeper timeout, *to = NULL;
1396        struct task_struct *curr = current;
1397        struct futex_hash_bucket *hb;
1398        u32 uval, newval, curval;
1399        struct futex_q q;
1400        int ret, lock_taken, ownerdied = 0, attempt = 0;
1401
1402        if (refill_pi_state_cache())
1403                return -ENOMEM;
1404
1405        if (time) {
1406                to = &timeout;
1407                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1408                                      HRTIMER_MODE_ABS);
1409                hrtimer_init_sleeper(to, current);
1410                hrtimer_set_expires(&to->timer, *time);
1411        }
1412
1413        q.pi_state = NULL;
1414 retry:
1415        futex_lock_mm(fshared);
1416
1417        ret = get_futex_key(uaddr, fshared, &q.key);
1418        if (unlikely(ret != 0))
1419                goto out_release_sem;
1420
1421 retry_unlocked:
1422        hb = queue_lock(&q);
1423
1424 retry_locked:
1425        ret = lock_taken = 0;
1426
1427        /*
1428         * To avoid races, we attempt to take the lock here again
1429         * (by doing a 0 -> TID atomic cmpxchg), while holding all
1430         * the locks. It will most likely not succeed.
1431         */
1432        newval = task_pid_vnr(current);
1433
1434        curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1435
1436        if (unlikely(curval == -EFAULT))
1437                goto uaddr_faulted;
1438
1439        /*
1440         * Detect deadlocks. In case of REQUEUE_PI this is a valid
1441         * situation and we return success to user space.
1442         */
1443        if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1444                ret = -EDEADLK;
1445                goto out_unlock_release_sem;
1446        }
1447
1448        /*
1449         * Surprise - we got the lock. Just return to userspace:
1450         */
1451        if (unlikely(!curval))
1452                goto out_unlock_release_sem;
1453
1454        uval = curval;
1455
1456        /*
1457         * Set the WAITERS flag, so the owner will know it has someone
1458         * to wake at next unlock
1459         */
1460        newval = curval | FUTEX_WAITERS;
1461
1462        /*
1463         * There are two cases, where a futex might have no owner (the
1464         * owner TID is 0): OWNER_DIED. We take over the futex in this
1465         * case. We also do an unconditional take over, when the owner
1466         * of the futex died.
1467         *
1468         * This is safe as we are protected by the hash bucket lock !
1469         */
1470        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1471                /* Keep the OWNER_DIED bit */
1472                newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1473                ownerdied = 0;
1474                lock_taken = 1;
1475        }
1476
1477        curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1478
1479        if (unlikely(curval == -EFAULT))
1480                goto uaddr_faulted;
1481        if (unlikely(curval != uval))
1482                goto retry_locked;
1483
1484        /*
1485         * We took the lock due to owner died take over.
1486         */
1487        if (unlikely(lock_taken))
1488                goto out_unlock_release_sem;
1489
1490        /*
1491         * We dont have the lock. Look up the PI state (or create it if
1492         * we are the first waiter):
1493         */
1494        ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1495
1496        if (unlikely(ret)) {
1497                switch (ret) {
1498
1499                case -EAGAIN:
1500                        /*
1501                         * Task is exiting and we just wait for the
1502                         * exit to complete.
1503                         */
1504                        queue_unlock(&q, hb);
1505                        futex_unlock_mm(fshared);
1506                        cond_resched();
1507                        goto retry;
1508
1509                case -ESRCH:
1510                        /*
1511                         * No owner found for this futex. Check if the
1512                         * OWNER_DIED bit is set to figure out whether
1513                         * this is a robust futex or not.
1514                         */
1515                        if (get_futex_value_locked(&curval, uaddr))
1516                                goto uaddr_faulted;
1517
1518                        /*
1519                         * We simply start over in case of a robust
1520                         * futex. The code above will take the futex
1521                         * and return happy.
1522                         */
1523                        if (curval & FUTEX_OWNER_DIED) {
1524                                ownerdied = 1;
1525                                goto retry_locked;
1526                        }
1527                default:
1528                        goto out_unlock_release_sem;
1529                }
1530        }
1531
1532        /*
1533         * Only actually queue now that the atomic ops are done:
1534         */
1535        queue_me(&q, hb);
1536
1537        /*
1538         * Now the futex is queued and we have checked the data, we
1539         * don't want to hold mmap_sem while we sleep.
1540         */
1541        futex_unlock_mm(fshared);
1542
1543        WARN_ON(!q.pi_state);
1544        /*
1545         * Block on the PI mutex:
1546         */
1547        if (!trylock)
1548                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1549        else {
1550                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1551                /* Fixup the trylock return value: */
1552                ret = ret ? 0 : -EWOULDBLOCK;
1553        }
1554
1555        futex_lock_mm(fshared);
1556        spin_lock(q.lock_ptr);
1557
1558        if (!ret) {
1559                /*
1560                 * Got the lock. We might not be the anticipated owner
1561                 * if we did a lock-steal - fix up the PI-state in
1562                 * that case:
1563                 */
1564                if (q.pi_state->owner != curr)
1565                        ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
1566        } else {
1567                /*
1568                 * Catch the rare case, where the lock was released
1569                 * when we were on the way back before we locked the
1570                 * hash bucket.
1571                 */
1572                if (q.pi_state->owner == curr) {
1573                        /*
1574                         * Try to get the rt_mutex now. This might
1575                         * fail as some other task acquired the
1576                         * rt_mutex after we removed ourselves from the
1577                         * rt_mutex waiters list.
1578                         */
1579                        if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1580                                ret = 0;
1581                        else {
1582                                /*
1583                                 * pi_state is incorrect: some other
1584                                 * task did a lock steal and we
1585                                 * returned due to timeout or signal
1586                                 * without taking the rt_mutex. Too
1587                                 * late. We can access the
1588                                 * rt_mutex_owner without locking, as
1589                                 * the other task is now blocked on
1590                                 * the hash bucket lock. Fix the state
1591                                 * up.
1592                                 */
1593                                struct task_struct *owner;
1594                                int res;
1595
1596                                owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1597                                res = fixup_pi_state_owner(uaddr, &q, owner,
1598                                                           fshared);
1599
1600                                /* propagate -EFAULT, if the fixup failed */
1601                                if (res)
1602                                        ret = res;
1603                        }
1604                } else {
1605                        /*
1606                         * Paranoia check. If we did not take the lock
1607                         * in the trylock above, then we should not be
1608                         * the owner of the rtmutex, neither the real
1609                         * owner nor the pending one:
1610                         */
1611                        if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1612                                printk(KERN_ERR "futex_lock_pi: ret = %d "
1613                                       "pi-mutex: %p pi-state %p\n", ret,
1614                                       q.pi_state->pi_mutex.owner,
1615                                       q.pi_state->owner);
1616                }
1617        }
1618
1619        /* Unqueue and drop the lock */
1620        unqueue_me_pi(&q);
1621        futex_unlock_mm(fshared);
1622
1623        if (to)
1624                destroy_hrtimer_on_stack(&to->timer);
1625        return ret != -EINTR ? ret : -ERESTARTNOINTR;
1626
1627 out_unlock_release_sem:
1628        queue_unlock(&q, hb);
1629
1630 out_release_sem:
1631        futex_unlock_mm(fshared);
1632        if (to)
1633                destroy_hrtimer_on_stack(&to->timer);
1634        return ret;
1635
1636 uaddr_faulted:
1637        /*
1638         * We have to r/w *(int __user *)uaddr, but we can't modify it
1639         * non-atomically.  Therefore, if get_user below is not
1640         * enough, we need to handle the fault ourselves, while
1641         * still holding the mmap_sem.
1642         *
1643         * ... and hb->lock. :-) --ANK
1644         */
1645        queue_unlock(&q, hb);
1646
1647        if (attempt++) {
1648                ret = futex_handle_fault((unsigned long)uaddr, fshared,
1649                                         attempt);
1650                if (ret)
1651                        goto out_release_sem;
1652                goto retry_unlocked;
1653        }
1654
1655        futex_unlock_mm(fshared);
1656
1657        ret = get_user(uval, uaddr);
1658        if (!ret && (uval != -EFAULT))
1659                goto retry;
1660
1661        if (to)
1662                destroy_hrtimer_on_stack(&to->timer);
1663        return ret;
1664}
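
/*
 * Illustrative sketch, not part of this file: the userspace fast path
 * that futex_lock_pi() above backs up. An uncontended PI futex is
 * taken entirely in userspace by switching the futex word from 0 to
 * the caller's TID; only on contention does the thread enter the
 * kernel with FUTEX_LOCK_PI. The wrapper name pi_lock() and its
 * argument are hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static void pi_lock(unsigned int *futex)
 *	{
 *		unsigned int tid = syscall(SYS_gettid);
 *
 *		// Fast path: 0 -> TID means we now own the lock.
 *		if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
 *			return;
 *
 *		// Contended: the kernel queues us on the PI rt_mutex,
 *		// boosts the owner and returns once we own the futex.
 *		syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 */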
1665
1666/*
1667 * Userspace attempted a TID -> 0 atomic transition, and failed.
1668 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock.
1670 */
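
/*
 * Illustrative sketch, not part of this file: the userspace fast path
 * whose failure lands us in futex_unlock_pi() below. The wrapper name
 * pi_unlock() is hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static void pi_unlock(unsigned int *futex)
 *	{
 *		unsigned int tid = syscall(SYS_gettid);
 *
 *		// Fast path: with no FUTEX_WAITERS bit set, releasing the
 *		// lock is a single TID -> 0 transition in userspace.
 *		if (__sync_val_compare_and_swap(futex, tid, 0) == tid)
 *			return;
 *
 *		// Waiters are queued in the kernel (or the cmpxchg raced):
 *		// let the kernel hand the lock to the topmost waiter.
 *		syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */
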
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1672{
1673        struct futex_hash_bucket *hb;
1674        struct futex_q *this, *next;
1675        u32 uval;
1676        struct plist_head *head;
1677        union futex_key key;
1678        int ret, attempt = 0;
1679
1680retry:
1681        if (get_user(uval, uaddr))
1682                return -EFAULT;
1683        /*
1684         * We release only a lock we actually own:
1685         */
1686        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687                return -EPERM;
1688        /*
1689         * First take all the futex related locks:
1690         */
1691        futex_lock_mm(fshared);
1692
1693        ret = get_futex_key(uaddr, fshared, &key);
1694        if (unlikely(ret != 0))
1695                goto out;
1696
1697        hb = hash_futex(&key);
1698retry_unlocked:
1699        spin_lock(&hb->lock);
1700
1701        /*
1702         * To avoid races, try to do the TID -> 0 atomic transition
1703         * again. If it succeeds then we can return without waking
1704         * anyone else up:
1705         */
1706        if (!(uval & FUTEX_OWNER_DIED))
1707                uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
1708
1709
1710        if (unlikely(uval == -EFAULT))
1711                goto pi_faulted;
1712        /*
1713         * Rare case: we managed to release the lock atomically,
1714         * no need to wake anyone else up:
1715         */
1716        if (unlikely(uval == task_pid_vnr(current)))
1717                goto out_unlock;
1718
1719        /*
1720         * Ok, other tasks may need to be woken up - check waiters
1721         * and do the wakeup if necessary:
1722         */
1723        head = &hb->chain;
1724
1725        plist_for_each_entry_safe(this, next, head, list) {
1726                if (!match_futex (&this->key, &key))
1727                        continue;
1728                ret = wake_futex_pi(uaddr, uval, this);
1729                /*
1730                 * The atomic access to the futex value
1731                 * generated a pagefault, so retry the
1732                 * user-access and the wakeup:
1733                 */
1734                if (ret == -EFAULT)
1735                        goto pi_faulted;
1736                goto out_unlock;
1737        }
1738        /*
1739         * No waiters - kernel unlocks the futex:
1740         */
1741        if (!(uval & FUTEX_OWNER_DIED)) {
1742                ret = unlock_futex_pi(uaddr, uval);
1743                if (ret == -EFAULT)
1744                        goto pi_faulted;
1745        }
1746
1747out_unlock:
1748        spin_unlock(&hb->lock);
1749out:
1750        futex_unlock_mm(fshared);
1751
1752        return ret;
1753
1754pi_faulted:
1755        /*
1756         * We have to r/w *(int __user *)uaddr, but we can't modify it
1757         * non-atomically.  Therefore, if get_user below is not
1758         * enough, we need to handle the fault ourselves, while
1759         * still holding the mmap_sem.
1760         *
1761         * ... and hb->lock. --ANK
1762         */
1763        spin_unlock(&hb->lock);
1764
1765        if (attempt++) {
1766                ret = futex_handle_fault((unsigned long)uaddr, fshared,
1767                                         attempt);
1768                if (ret)
1769                        goto out;
1770                uval = 0;
1771                goto retry_unlocked;
1772        }
1773
1774        futex_unlock_mm(fshared);
1775
1776        ret = get_user(uval, uaddr);
1777        if (!ret && (uval != -EFAULT))
1778                goto retry;
1779
1780        return ret;
1781}
1782
1783/*
1784 * Support for robust futexes: the kernel cleans up held futexes at
1785 * thread exit time.
1786 *
1787 * Implementation: user-space maintains a per-thread list of locks it
1788 * is holding. Upon do_exit(), the kernel carefully walks this list,
1789 * and marks all locks that are owned by this thread with the
1790 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
1791 * always manipulated with the lock held, so the list is private and
1792 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
1793 * field, to allow the kernel to clean up if the thread dies after
1794 * acquiring the lock, but just before it could have added itself to
1795 * the list. There can only be one such pending lock.
1796 */
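
/*
 * Illustrative userspace sketch, not part of this file: how a
 * threading library might lay out and register the robust list that
 * exit_robust_list() below walks. struct robust_list_head and
 * struct robust_list come from <linux/futex.h>; the mutex type, its
 * field names and register_robust_list() are hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stddef.h>
 *
 *	struct robust_mutex {
 *		struct robust_list list;	// links the locks this thread holds
 *		unsigned int futex;		// futex word: owner TID, 0 when free
 *	};
 *
 *	static __thread struct robust_list_head robust_head;
 *
 *	// Call once per thread, before taking any robust_mutex.
 *	static void register_robust_list(void)
 *	{
 *		// An empty list points back at the head itself.
 *		robust_head.list.next = &robust_head.list;
 *		// Offset the kernel adds to a list entry to find the futex word.
 *		robust_head.futex_offset = offsetof(struct robust_mutex, futex) -
 *					   offsetof(struct robust_mutex, list);
 *		robust_head.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
 *	}
 *
 * On lock and unlock the library links the mutex into (and removes it
 * from) robust_head.list, so the walk below sees exactly the locks the
 * thread still held when it died.
 */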
1797
1798/**
1799 * sys_set_robust_list - set the robust-futex list head of a task
1800 * @head: pointer to the list-head
1801 * @len: length of the list-head, as userspace expects
1802 */
1803asmlinkage long
1804sys_set_robust_list(struct robust_list_head __user *head,
1805                    size_t len)
1806{
1807        if (!futex_cmpxchg_enabled)
1808                return -ENOSYS;
1809        /*
1810         * The kernel knows only one size for now:
1811         */
1812        if (unlikely(len != sizeof(*head)))
1813                return -EINVAL;
1814
1815        current->robust_list = head;
1816
1817        return 0;
1818}
1819
1820/**
1821 * sys_get_robust_list - get the robust-futex list head of a task
1822 * @pid: pid of the process [zero for current task]
1823 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
1824 * @len_ptr: pointer to a length field, the kernel fills in the header size
1825 */
1826asmlinkage long
1827sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1828                    size_t __user *len_ptr)
1829{
1830        struct robust_list_head __user *head;
1831        unsigned long ret;
1832
1833        if (!futex_cmpxchg_enabled)
1834                return -ENOSYS;
1835
1836        if (!pid)
1837                head = current->robust_list;
1838        else {
1839                struct task_struct *p;
1840
1841                ret = -ESRCH;
1842                rcu_read_lock();
1843                p = find_task_by_vpid(pid);
1844                if (!p)
1845                        goto err_unlock;
1846                ret = -EPERM;
1847                if ((current->euid != p->euid) && (current->euid != p->uid) &&
1848                                !capable(CAP_SYS_PTRACE))
1849                        goto err_unlock;
1850                head = p->robust_list;
1851                rcu_read_unlock();
1852        }
1853
1854        if (put_user(sizeof(*head), len_ptr))
1855                return -EFAULT;
1856        return put_user(head, head_ptr);
1857
1858err_unlock:
1859        rcu_read_unlock();
1860
1861        return ret;
1862}
1863
1864/*
1865 * Process a futex-list entry, check whether it's owned by the
1866 * dying task, and do notification if so:
1867 */
1868int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
1869{
1870        u32 uval, nval, mval;
1871
1872retry:
1873        if (get_user(uval, uaddr))
1874                return -1;
1875
1876        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
1877                /*
1878                 * Ok, this dying thread is truly holding a futex
1879                 * of interest. Set the OWNER_DIED bit atomically
1880                 * via cmpxchg, and if the value had FUTEX_WAITERS
1881                 * set, wake up a waiter (if any). (We have to do a
1882                 * futex_wake() even if OWNER_DIED is already set -
1883                 * to handle the rare but possible case of recursive
1884                 * thread-death.) The rest of the cleanup is done in
1885                 * userspace.
1886                 */
1887                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1888                nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1889
1890                if (nval == -EFAULT)
1891                        return -1;
1892
1893                if (nval != uval)
1894                        goto retry;
1895
1896                /*
1897                 * Wake robust non-PI futexes here. The wakeup of
1898                 * PI futexes happens in exit_pi_state():
1899                 */
1900                if (!pi && (uval & FUTEX_WAITERS))
1901                        futex_wake(uaddr, &curr->mm->mmap_sem, 1,
1902                                   FUTEX_BITSET_MATCH_ANY);
1903        }
1904        return 0;
1905}
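
/*
 * Illustrative sketch, not part of this file: one way the "rest of the
 * cleanup" can look from userspace after handle_futex_death() above has
 * run. Modeled loosely on the robust-mutex convention; robust_trylock()
 * and its return values are hypothetical.
 *
 *	#include <errno.h>
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// 0: clean acquisition; EOWNERDEAD: taken over from a dead owner,
 *	// the caller must repair the protected state; EBUSY: lock is held.
 *	static int robust_trylock(unsigned int *futex)
 *	{
 *		unsigned int tid = syscall(SYS_gettid);
 *		unsigned int old = __sync_val_compare_and_swap(futex, 0, tid);
 *
 *		if (old == 0)
 *			return 0;
 *
 *		if (old & FUTEX_OWNER_DIED) {
 *			// The kernel cleared the dead owner's TID and set
 *			// FUTEX_OWNER_DIED; claim the word, keeping any waiters.
 *			if (__sync_val_compare_and_swap(futex, old,
 *					tid | (old & FUTEX_WAITERS)) == old)
 *				return EOWNERDEAD;
 *		}
 *		return EBUSY;
 *	}
 */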
1906
1907/*
1908 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1909 */
1910static inline int fetch_robust_entry(struct robust_list __user **entry,
1911                                     struct robust_list __user * __user *head,
1912                                     int *pi)
1913{
1914        unsigned long uentry;
1915
1916        if (get_user(uentry, (unsigned long __user *)head))
1917                return -EFAULT;
1918
1919        *entry = (void __user *)(uentry & ~1UL);
1920        *pi = uentry & 1;
1921
1922        return 0;
1923}
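
/*
 * Illustrative sketch, not part of this file: the userspace side of the
 * bit-0 convention read above. When a thread is about to take a PI
 * robust lock it publishes the entry in list_op_pending with the low
 * bit set; mark_pending_pi() and its arguments are hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <stdint.h>
 *
 *	static void mark_pending_pi(struct robust_list_head *head,
 *				    struct robust_list *entry)
 *	{
 *		// Bit 0 tells exit_robust_list() this futex uses the PI protocol.
 *		head->list_op_pending =
 *			(struct robust_list *)((uintptr_t)entry | 1UL);
 *	}
 */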
1924
1925/*
1926 * Walk curr->robust_list (very carefully, it's a userspace list!)
1927 * and mark any locks found there dead, and notify any waiters.
1928 *
1929 * We silently return on any sign of a list-walking problem.
1930 */
1931void exit_robust_list(struct task_struct *curr)
1932{
1933        struct robust_list_head __user *head = curr->robust_list;
1934        struct robust_list __user *entry, *next_entry, *pending;
1935        unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
1936        unsigned long futex_offset;
1937        int rc;
1938
1939        if (!futex_cmpxchg_enabled)
1940                return;
1941
1942        /*
1943         * Fetch the list head (which was registered earlier, via
1944         * sys_set_robust_list()):
1945         */
1946        if (fetch_robust_entry(&entry, &head->list.next, &pi))
1947                return;
1948        /*
1949         * Fetch the relative futex offset:
1950         */
1951        if (get_user(futex_offset, &head->futex_offset))
1952                return;
1953        /*
1954         * Fetch any possibly pending lock-add first, and handle it
1955         * if it exists:
1956         */
1957        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
1958                return;
1959
1960        next_entry = NULL;        /* avoid warning with gcc */
1961        while (entry != &head->list) {
1962                /*
1963                 * Fetch the next entry in the list before calling
1964                 * handle_futex_death:
1965                 */
1966                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
1967                /*
1968                 * A pending lock might already be on the list, so
1969                 * don't process it twice:
1970                 */
1971                if (entry != pending)
1972                        if (handle_futex_death((void __user *)entry + futex_offset,
1973                                                curr, pi))
1974                                return;
1975                if (rc)
1976                        return;
1977                entry = next_entry;
1978                pi = next_pi;
1979                /*
1980                 * Avoid excessively long or circular lists:
1981                 */
1982                if (!--limit)
1983                        break;
1984
1985                cond_resched();
1986        }
1987
1988        if (pending)
1989                handle_futex_death((void __user *)pending + futex_offset,
1990                                   curr, pip);
1991}
1992
1993long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1994                u32 __user *uaddr2, u32 val2, u32 val3)
1995{
1996        int ret = -ENOSYS;
1997        int cmd = op & FUTEX_CMD_MASK;
1998        struct rw_semaphore *fshared = NULL;
1999
2000        if (!(op & FUTEX_PRIVATE_FLAG))
2001                fshared = &current->mm->mmap_sem;
2002
2003        switch (cmd) {
2004        case FUTEX_WAIT:
2005                val3 = FUTEX_BITSET_MATCH_ANY;
2006        case FUTEX_WAIT_BITSET:
2007                ret = futex_wait(uaddr, fshared, val, timeout, val3);
2008                break;
2009        case FUTEX_WAKE:
2010                val3 = FUTEX_BITSET_MATCH_ANY;
2011        case FUTEX_WAKE_BITSET:
2012                ret = futex_wake(uaddr, fshared, val, val3);
2013                break;
2014        case FUTEX_REQUEUE:
2015                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2016                break;
2017        case FUTEX_CMP_REQUEUE:
2018                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
2019                break;
2020        case FUTEX_WAKE_OP:
2021                ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
2022                break;
2023        case FUTEX_LOCK_PI:
2024                if (futex_cmpxchg_enabled)
2025                        ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
2026                break;
2027        case FUTEX_UNLOCK_PI:
2028                if (futex_cmpxchg_enabled)
2029                        ret = futex_unlock_pi(uaddr, fshared);
2030                break;
2031        case FUTEX_TRYLOCK_PI:
2032                if (futex_cmpxchg_enabled)
2033                        ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2034                break;
2035        default:
2036                ret = -ENOSYS;
2037        }
2038        return ret;
2039}
2040
2041
2042asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2043                          struct timespec __user *utime, u32 __user *uaddr2,
2044                          u32 val3)
2045{
2046        struct timespec ts;
2047        ktime_t t, *tp = NULL;
2048        u32 val2 = 0;
2049        int cmd = op & FUTEX_CMD_MASK;
2050
2051        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2052                      cmd == FUTEX_WAIT_BITSET)) {
2053                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2054                        return -EFAULT;
2055                if (!timespec_valid(&ts))
2056                        return -EINVAL;
2057
2058                t = timespec_to_ktime(ts);
2059                if (cmd == FUTEX_WAIT)
2060                        t = ktime_add_safe(ktime_get(), t);
2061                tp = &t;
2062        }
2063        /*
2064         * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
2065         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2066         */
2067        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2068            cmd == FUTEX_WAKE_OP)
2069                val2 = (u32) (unsigned long) utime;
2070
2071        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2072}
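
/*
 * Illustrative sketch, not part of this file: how callers drive the
 * multiplexed syscall above. For FUTEX_WAIT the fourth argument is a
 * relative timeout (made absolute by the ktime_add_safe() above); for
 * FUTEX_CMP_REQUEUE the same slot carries the number of waiters to
 * requeue, cast to a pointer. The wrapper names are hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <time.h>
 *
 *	static long futex_wait_rel(unsigned int *uaddr, unsigned int val,
 *				   const struct timespec *rel_timeout)
 *	{
 *		// Sleeps only if *uaddr still equals val; returns when
 *		// woken or when the relative timeout expires.
 *		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
 *			       rel_timeout, NULL, 0);
 *	}
 *
 *	static long futex_cmp_requeue(unsigned int *uaddr, unsigned int *uaddr2,
 *				      int nr_wake, int nr_requeue,
 *				      unsigned int expected)
 *	{
 *		// Wakes nr_wake waiters on uaddr and moves up to nr_requeue
 *		// more to uaddr2, provided *uaddr still equals 'expected'.
 *		return syscall(SYS_futex, uaddr, FUTEX_CMP_REQUEUE, nr_wake,
 *			       (void *)(unsigned long)nr_requeue, uaddr2,
 *			       expected);
 *	}
 */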
2073
2074static int __init futex_init(void)
2075{
2076        u32 curval;
2077        int i;
2078
2079        /*
2080         * This call is expected to fail. Some arch implementations do
2081         * runtime detection of the futex_atomic_cmpxchg_inatomic()
2082         * functionality. We want to know that before we call into any
2083         * of the complex code paths, and we also want to prevent
2084         * registration of robust lists in that case. NULL is
2085         * guaranteed to fault: a functional implementation returns
2086         * -EFAULT, while the non-functional ones return
2087         * -ENOSYS.
2088         */
2089        curval = cmpxchg_futex_value_locked(NULL, 0, 0);
2090        if (curval == -EFAULT)
2091                futex_cmpxchg_enabled = 1;
2092
2093        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2094                plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
2095                spin_lock_init(&futex_queues[i].lock);
2096        }
2097
2098        return 0;
2099}
2100__initcall(futex_init);