Showing error 1882

User: Jiri Slaby
Error type: Invalid Pointer Dereference
Error type description: An invalid pointer is dereferenced
File location: net/core/sock.c
Line in file: 2100
Project: Linux Kernel
Project version: 2.6.28
Confirmation: Fixed by 72150e9b7fec217fbd646a29ea2f65a3d4d55ea9
Tools: Smatch (1.59)
Entered: 2013-09-11 08:47:26 UTC
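
Reproduction: Smatch plugs into kbuild's static-checker hook, so the warning
can typically be re-generated for just this file with something like the
following (assuming smatch is on $PATH; adjust the path and tree as needed):

    make C=1 CHECK="smatch -p=kernel" net/core/sock.o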


Source:
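
(The flagged line, 2100, lies past the end of the excerpt below. As a sketch
of the error class only, using hypothetical code rather than the code Smatch
actually flagged, the tool reports patterns of this general shape:

    struct sock *sk = sock->sk;

    if (!sk)
        printk(KERN_WARNING "socket is gone\n");
    sk->sk_err = 0;    /* dereferenced even on the path where sk is NULL */

Smatch tracks the NULL test and warns about the unconditional dereference
that follows it.)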

   1/*
   2 * INET                An implementation of the TCP/IP protocol suite for the LINUX
   3 *                operating system.  INET is implemented using the  BSD Socket
   4 *                interface as the means of communication with the user level.
   5 *
   6 *                Generic socket support routines. Memory allocators, socket lock/release
   7 *                handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:        Ross Biro
  11 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                Florian La Roche, <flla@stud.uni-sb.de>
  13 *                Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *                Alan Cox        :         Numerous verify_area() problems
  17 *                Alan Cox        :        Connecting on a connecting socket
  18 *                                        now returns an error for tcp.
  19 *                Alan Cox        :        sock->protocol is set correctly.
  20 *                                        and is not sometimes left as 0.
  21 *                Alan Cox        :        connect handles icmp errors on a
  22 *                                        connect properly. Unfortunately there
  23 *                                        is a restart syscall nasty there. I
  24 *                                        can't match BSD without hacking the C
  25 *                                        library. Ideas urgently sought!
  26 *                Alan Cox        :        Disallow bind() to addresses that are
  27 *                                        not ours - especially broadcast ones!!
  28 *                Alan Cox        :        Socket 1024 _IS_ ok for users. (fencepost)
  29 *                Alan Cox        :        sock_wfree/sock_rfree don't destroy sockets,
  30 *                                        instead they leave that for the DESTROY timer.
  31 *                Alan Cox        :        Clean up error flag in accept
  32 *                Alan Cox        :        TCP ack handling is buggy, the DESTROY timer
  33 *                                        was buggy. Put a remove_sock() in the handler
  34 *                                        for memory when we hit 0. Also altered the timer
  35 *                                        code. The ACK stuff can wait and needs major
  36 *                                        TCP layer surgery.
  37 *                Alan Cox        :        Fixed TCP ack bug, removed remove sock
  38 *                                        and fixed timer/inet_bh race.
  39 *                Alan Cox        :        Added zapped flag for TCP
  40 *                Alan Cox        :        Move kfree_skb into skbuff.c and tidied up surplus code
  41 *                Alan Cox        :        for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *                Alan Cox        :        kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *                Alan Cox        :        Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *                Alan Cox        :        Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *                Rick Sladkey        :        Relaxed UDP rules for matching packets.
  46 *                C.E.Hawkins        :        IFF_PROMISC/SIOCGHWADDR support
  47 *        Pauline Middelink        :        identd support
  48 *                Alan Cox        :        Fixed connect() taking signals I think.
  49 *                Alan Cox        :        SO_LINGER supported
  50 *                Alan Cox        :        Error reporting fixes
  51 *                Anonymous        :        inet_create tidied up (sk->reuse setting)
  52 *                Alan Cox        :        inet sockets don't set sk->type!
  53 *                Alan Cox        :        Split socket option code
  54 *                Alan Cox        :        Callbacks
  55 *                Alan Cox        :        Nagle flag for Charles & Johannes stuff
  56 *                Alex                :        Removed restriction on inet fioctl
  57 *                Alan Cox        :        Splitting INET from NET core
  58 *                Alan Cox        :        Fixed bogus SO_TYPE handling in getsockopt()
  59 *                Adam Caldwell        :        Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *                Alan Cox        :        Split IP from generic code
  61 *                Alan Cox        :        New kfree_skbmem()
  62 *                Alan Cox        :        Make SO_DEBUG superuser only.
  63 *                Alan Cox        :        Allow anyone to clear SO_DEBUG
  64 *                                        (compatibility fix)
  65 *                Alan Cox        :        Added optimistic memory grabbing for AF_UNIX throughput.
  66 *                Alan Cox        :        Allocator for a socket is settable.
  67 *                Alan Cox        :        SO_ERROR includes soft errors.
  68 *                Alan Cox        :        Allow NULL arguments on some SO_ opts
  69 *                Alan Cox        :         Generic socket allocation to make hooks
  70 *                                        easier (suggested by Craig Metz).
  71 *                Michael Pall        :        SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *                Jay Schulist        :        Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *                Andi Kleen        :        Add sock_kmalloc()/sock_kfree_s()
  79 *                Andi Kleen        :        Fix write_space callback
  80 *                Chris Evans        :        Security fixes - signedness again
  81 *                Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *                This program is free software; you can redistribute it and/or
  87 *                modify it under the terms of the GNU General Public License
  88 *                as published by the Free Software Foundation; either version
  89 *                2 of the License, or (at your option) any later version.
  90 */
  91
  92#include <linux/capability.h>
  93#include <linux/errno.h>
  94#include <linux/types.h>
  95#include <linux/socket.h>
  96#include <linux/in.h>
  97#include <linux/kernel.h>
  98#include <linux/module.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/sched.h>
 102#include <linux/timer.h>
 103#include <linux/string.h>
 104#include <linux/sockios.h>
 105#include <linux/net.h>
 106#include <linux/mm.h>
 107#include <linux/slab.h>
 108#include <linux/interrupt.h>
 109#include <linux/poll.h>
 110#include <linux/tcp.h>
 111#include <linux/init.h>
 112#include <linux/highmem.h>
 113
 114#include <asm/uaccess.h>
 115#include <asm/system.h>
 116
 117#include <linux/netdevice.h>
 118#include <net/protocol.h>
 119#include <linux/skbuff.h>
 120#include <net/net_namespace.h>
 121#include <net/request_sock.h>
 122#include <net/sock.h>
 123#include <net/xfrm.h>
 124#include <linux/ipsec.h>
 125
 126#include <linux/filter.h>
 127
 128#ifdef CONFIG_INET
 129#include <net/tcp.h>
 130#endif
 131
 132/*
 133 * Each address family might have different locking rules, so we have
 134 * one slock key per address family:
 135 */
 136static struct lock_class_key af_family_keys[AF_MAX];
 137static struct lock_class_key af_family_slock_keys[AF_MAX];
 138
 139/*
 140 * Make lock validator output more readable. (we pre-construct these
 141 * strings build-time, so that runtime initialization of socket
 142 * locks is fast):
 143 */
 144static const char *af_family_key_strings[AF_MAX+1] = {
 145  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 146  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 147  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 148  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 149  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 150  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 151  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 152  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 153  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 154  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 155  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 156  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 157  "sk_lock-AF_MAX"
 158};
 159static const char *af_family_slock_key_strings[AF_MAX+1] = {
 160  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 161  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 162  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 163  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 164  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 165  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 166  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 167  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 168  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 169  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 170  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 171  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 172  "slock-AF_MAX"
 173};
 174static const char *af_family_clock_key_strings[AF_MAX+1] = {
 175  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 176  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 177  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 178  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 179  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 180  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 181  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 182  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 183  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 184  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 185  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 186  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 187  "clock-AF_MAX"
 188};
 189
 190/*
 191 * sk_callback_lock locking rules are per-address-family,
 192 * so split the lock classes by using a per-AF key:
 193 */
 194static struct lock_class_key af_callback_keys[AF_MAX];
 195
 196/* Take into consideration the size of the struct sk_buff overhead in the
 197 * determination of these values, since that is non-constant across
 198 * platforms.  This makes socket queueing behavior and performance
 199 * not depend upon such differences.
 200 */
 201#define _SK_MEM_PACKETS                256
 202#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 203#define SK_WMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 204#define SK_RMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 205
 206/* Run time adjustable parameters. */
 207__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 208__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 209__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 210__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 211
  212/* Maximal space eaten by iovec or ancillary data plus some space */
 213int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 214
 215static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 216{
 217        struct timeval tv;
 218
 219        if (optlen < sizeof(tv))
 220                return -EINVAL;
 221        if (copy_from_user(&tv, optval, sizeof(tv)))
 222                return -EFAULT;
 223        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 224                return -EDOM;
 225
 226        if (tv.tv_sec < 0) {
 227                static int warned __read_mostly;
 228
 229                *timeo_p = 0;
 230                if (warned < 10 && net_ratelimit()) {
 231                        warned++;
 232                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 233                               "tries to set negative timeout\n",
 234                                current->comm, task_pid_nr(current));
 235                }
 236                return 0;
 237        }
 238        *timeo_p = MAX_SCHEDULE_TIMEOUT;
 239        if (tv.tv_sec == 0 && tv.tv_usec == 0)
 240                return 0;
 241        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 242                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 243        return 0;
 244}
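/*
 * Editor's note, not part of sock.c: sock_set_timeout() above is reached
 * from userspace through setsockopt(2) with a struct timeval. A minimal
 * sketch, userspace C compiled separately, with fd assumed to be an open
 * socket:
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int set_recv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

	/* Afterwards a blocking recv() on fd fails with EAGAIN/EWOULDBLOCK
	 * once roughly five seconds pass without data. */
	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif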
 245
 246static void sock_warn_obsolete_bsdism(const char *name)
 247{
 248        static int warned;
 249        static char warncomm[TASK_COMM_LEN];
 250        if (strcmp(warncomm, current->comm) && warned < 5) {
 251                strcpy(warncomm,  current->comm);
 252                printk(KERN_WARNING "process `%s' is using obsolete "
 253                       "%s SO_BSDCOMPAT\n", warncomm, name);
 254                warned++;
 255        }
 256}
 257
 258static void sock_disable_timestamp(struct sock *sk)
 259{
 260        if (sock_flag(sk, SOCK_TIMESTAMP)) {
 261                sock_reset_flag(sk, SOCK_TIMESTAMP);
 262                net_disable_timestamp();
 263        }
 264}
 265
 266
 267int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 268{
 269        int err = 0;
 270        int skb_len;
 271
 272        /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 273           number of warnings when compiling with -W --ANK
 274         */
 275        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 276            (unsigned)sk->sk_rcvbuf) {
 277                err = -ENOMEM;
 278                goto out;
 279        }
 280
 281        err = sk_filter(sk, skb);
 282        if (err)
 283                goto out;
 284
 285        if (!sk_rmem_schedule(sk, skb->truesize)) {
 286                err = -ENOBUFS;
 287                goto out;
 288        }
 289
 290        skb->dev = NULL;
 291        skb_set_owner_r(skb, sk);
 292
 293        /* Cache the SKB length before we tack it onto the receive
 294         * queue.  Once it is added it no longer belongs to us and
 295         * may be freed by other threads of control pulling packets
 296         * from the queue.
 297         */
 298        skb_len = skb->len;
 299
 300        skb_queue_tail(&sk->sk_receive_queue, skb);
 301
 302        if (!sock_flag(sk, SOCK_DEAD))
 303                sk->sk_data_ready(sk, skb_len);
 304out:
 305        return err;
 306}
 307EXPORT_SYMBOL(sock_queue_rcv_skb);
 308
 309int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 310{
 311        int rc = NET_RX_SUCCESS;
 312
 313        if (sk_filter(sk, skb))
 314                goto discard_and_relse;
 315
 316        skb->dev = NULL;
 317
 318        if (nested)
 319                bh_lock_sock_nested(sk);
 320        else
 321                bh_lock_sock(sk);
 322        if (!sock_owned_by_user(sk)) {
 323                /*
 324                 * trylock + unlock semantics:
 325                 */
 326                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 327
 328                rc = sk_backlog_rcv(sk, skb);
 329
 330                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 331        } else
 332                sk_add_backlog(sk, skb);
 333        bh_unlock_sock(sk);
 334out:
 335        sock_put(sk);
 336        return rc;
 337discard_and_relse:
 338        kfree_skb(skb);
 339        goto out;
 340}
 341EXPORT_SYMBOL(sk_receive_skb);
 342
 343struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 344{
 345        struct dst_entry *dst = sk->sk_dst_cache;
 346
 347        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 348                sk->sk_dst_cache = NULL;
 349                dst_release(dst);
 350                return NULL;
 351        }
 352
 353        return dst;
 354}
 355EXPORT_SYMBOL(__sk_dst_check);
 356
 357struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 358{
 359        struct dst_entry *dst = sk_dst_get(sk);
 360
 361        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 362                sk_dst_reset(sk);
 363                dst_release(dst);
 364                return NULL;
 365        }
 366
 367        return dst;
 368}
 369EXPORT_SYMBOL(sk_dst_check);
 370
 371static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 372{
 373        int ret = -ENOPROTOOPT;
 374#ifdef CONFIG_NETDEVICES
 375        struct net *net = sock_net(sk);
 376        char devname[IFNAMSIZ];
 377        int index;
 378
 379        /* Sorry... */
 380        ret = -EPERM;
 381        if (!capable(CAP_NET_RAW))
 382                goto out;
 383
 384        ret = -EINVAL;
 385        if (optlen < 0)
 386                goto out;
 387
 388        /* Bind this socket to a particular device like "eth0",
 389         * as specified in the passed interface name. If the
 390         * name is "" or the option length is zero the socket
 391         * is not bound.
 392         */
 393        if (optlen > IFNAMSIZ - 1)
 394                optlen = IFNAMSIZ - 1;
 395        memset(devname, 0, sizeof(devname));
 396
 397        ret = -EFAULT;
 398        if (copy_from_user(devname, optval, optlen))
 399                goto out;
 400
 401        if (devname[0] == '\0') {
 402                index = 0;
 403        } else {
 404                struct net_device *dev = dev_get_by_name(net, devname);
 405
 406                ret = -ENODEV;
 407                if (!dev)
 408                        goto out;
 409
 410                index = dev->ifindex;
 411                dev_put(dev);
 412        }
 413
 414        lock_sock(sk);
 415        sk->sk_bound_dev_if = index;
 416        sk_dst_reset(sk);
 417        release_sock(sk);
 418
 419        ret = 0;
 420
 421out:
 422#endif
 423
 424        return ret;
 425}
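/*
 * Editor's note, not part of sock.c: SO_BINDTODEVICE takes the interface
 * name itself as the option value, as handled by sock_bindtodevice() above.
 * A hedged userspace sketch, compiled separately, with "eth0" assumed:
 */
#if 0
#include <string.h>
#include <sys/socket.h>

static int bind_to_eth0(int fd)
{
	const char ifname[] = "eth0";

	/* Requires CAP_NET_RAW; an empty name removes the binding. */
	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname) + 1);
}
#endif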
 426
 427static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 428{
 429        if (valbool)
 430                sock_set_flag(sk, bit);
 431        else
 432                sock_reset_flag(sk, bit);
 433}
 434
 435/*
 436 *        This is meant for all protocols to use and covers goings on
 437 *        at the socket level. Everything here is generic.
 438 */
 439
 440int sock_setsockopt(struct socket *sock, int level, int optname,
 441                    char __user *optval, int optlen)
 442{
 443        struct sock *sk=sock->sk;
 444        int val;
 445        int valbool;
 446        struct linger ling;
 447        int ret = 0;
 448
 449        /*
 450         *        Options without arguments
 451         */
 452
 453        if (optname == SO_BINDTODEVICE)
 454                return sock_bindtodevice(sk, optval, optlen);
 455
 456        if (optlen < sizeof(int))
 457                return -EINVAL;
 458
 459        if (get_user(val, (int __user *)optval))
 460                return -EFAULT;
 461
 462        valbool = val?1:0;
 463
 464        lock_sock(sk);
 465
 466        switch(optname) {
 467        case SO_DEBUG:
 468                if (val && !capable(CAP_NET_ADMIN)) {
 469                        ret = -EACCES;
 470                } else
 471                        sock_valbool_flag(sk, SOCK_DBG, valbool);
 472                break;
 473        case SO_REUSEADDR:
 474                sk->sk_reuse = valbool;
 475                break;
 476        case SO_TYPE:
 477        case SO_ERROR:
 478                ret = -ENOPROTOOPT;
 479                break;
 480        case SO_DONTROUTE:
 481                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 482                break;
 483        case SO_BROADCAST:
 484                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 485                break;
 486        case SO_SNDBUF:
  487                /* Don't error on this; BSD doesn't, and if you think
  488                   about it, this is right. Otherwise apps have to
  489                   play 'guess the biggest size' games. RCVBUF/SNDBUF
  490                   are treated in BSD as hints */
 491
 492                if (val > sysctl_wmem_max)
 493                        val = sysctl_wmem_max;
 494set_sndbuf:
 495                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 496                if ((val * 2) < SOCK_MIN_SNDBUF)
 497                        sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 498                else
 499                        sk->sk_sndbuf = val * 2;
 500
 501                /*
 502                 *        Wake up sending tasks if we
 503                 *        upped the value.
 504                 */
 505                sk->sk_write_space(sk);
 506                break;
 507
 508        case SO_SNDBUFFORCE:
 509                if (!capable(CAP_NET_ADMIN)) {
 510                        ret = -EPERM;
 511                        break;
 512                }
 513                goto set_sndbuf;
 514
 515        case SO_RCVBUF:
  516                /* Don't error on this; BSD doesn't, and if you think
  517                   about it, this is right. Otherwise apps have to
  518                   play 'guess the biggest size' games. RCVBUF/SNDBUF
  519                   are treated in BSD as hints */
 520
 521                if (val > sysctl_rmem_max)
 522                        val = sysctl_rmem_max;
 523set_rcvbuf:
 524                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 525                /*
 526                 * We double it on the way in to account for
 527                 * "struct sk_buff" etc. overhead.   Applications
 528                 * assume that the SO_RCVBUF setting they make will
 529                 * allow that much actual data to be received on that
 530                 * socket.
 531                 *
 532                 * Applications are unaware that "struct sk_buff" and
 533                 * other overheads allocate from the receive buffer
 534                 * during socket buffer allocation.
 535                 *
 536                 * And after considering the possible alternatives,
 537                 * returning the value we actually used in getsockopt
 538                 * is the most desirable behavior.
 539                 */
 540                if ((val * 2) < SOCK_MIN_RCVBUF)
 541                        sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 542                else
 543                        sk->sk_rcvbuf = val * 2;
 544                break;
 545
 546        case SO_RCVBUFFORCE:
 547                if (!capable(CAP_NET_ADMIN)) {
 548                        ret = -EPERM;
 549                        break;
 550                }
 551                goto set_rcvbuf;
 552
 553        case SO_KEEPALIVE:
 554#ifdef CONFIG_INET
 555                if (sk->sk_protocol == IPPROTO_TCP)
 556                        tcp_set_keepalive(sk, valbool);
 557#endif
 558                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 559                break;
 560
 561        case SO_OOBINLINE:
 562                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 563                break;
 564
 565        case SO_NO_CHECK:
 566                sk->sk_no_check = valbool;
 567                break;
 568
 569        case SO_PRIORITY:
 570                if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 571                        sk->sk_priority = val;
 572                else
 573                        ret = -EPERM;
 574                break;
 575
 576        case SO_LINGER:
 577                if (optlen < sizeof(ling)) {
 578                        ret = -EINVAL;        /* 1003.1g */
 579                        break;
 580                }
 581                if (copy_from_user(&ling,optval,sizeof(ling))) {
 582                        ret = -EFAULT;
 583                        break;
 584                }
 585                if (!ling.l_onoff)
 586                        sock_reset_flag(sk, SOCK_LINGER);
 587                else {
 588#if (BITS_PER_LONG == 32)
 589                        if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 590                                sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 591                        else
 592#endif
 593                                sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 594                        sock_set_flag(sk, SOCK_LINGER);
 595                }
 596                break;
 597
 598        case SO_BSDCOMPAT:
 599                sock_warn_obsolete_bsdism("setsockopt");
 600                break;
 601
 602        case SO_PASSCRED:
 603                if (valbool)
 604                        set_bit(SOCK_PASSCRED, &sock->flags);
 605                else
 606                        clear_bit(SOCK_PASSCRED, &sock->flags);
 607                break;
 608
 609        case SO_TIMESTAMP:
 610        case SO_TIMESTAMPNS:
 611                if (valbool)  {
 612                        if (optname == SO_TIMESTAMP)
 613                                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 614                        else
 615                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 616                        sock_set_flag(sk, SOCK_RCVTSTAMP);
 617                        sock_enable_timestamp(sk);
 618                } else {
 619                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
 620                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 621                }
 622                break;
 623
 624        case SO_RCVLOWAT:
 625                if (val < 0)
 626                        val = INT_MAX;
 627                sk->sk_rcvlowat = val ? : 1;
 628                break;
 629
 630        case SO_RCVTIMEO:
 631                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 632                break;
 633
 634        case SO_SNDTIMEO:
 635                ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 636                break;
 637
 638        case SO_ATTACH_FILTER:
 639                ret = -EINVAL;
 640                if (optlen == sizeof(struct sock_fprog)) {
 641                        struct sock_fprog fprog;
 642
 643                        ret = -EFAULT;
 644                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
 645                                break;
 646
 647                        ret = sk_attach_filter(&fprog, sk);
 648                }
 649                break;
 650
 651        case SO_DETACH_FILTER:
 652                ret = sk_detach_filter(sk);
 653                break;
 654
 655        case SO_PASSSEC:
 656                if (valbool)
 657                        set_bit(SOCK_PASSSEC, &sock->flags);
 658                else
 659                        clear_bit(SOCK_PASSSEC, &sock->flags);
 660                break;
 661        case SO_MARK:
 662                if (!capable(CAP_NET_ADMIN))
 663                        ret = -EPERM;
 664                else {
 665                        sk->sk_mark = val;
 666                }
 667                break;
 668
 669                /* We implement the SO_SNDLOWAT etc to
 670                   not be settable (1003.1g 5.3) */
 671        default:
 672                ret = -ENOPROTOOPT;
 673                break;
 674        }
 675        release_sock(sk);
 676        return ret;
 677}
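/*
 * Editor's note, not part of sock.c: the val * 2 applied in the SO_SNDBUF
 * and SO_RCVBUF cases above is visible to applications, because getsockopt()
 * reports the doubled value. A hedged userspace sketch, compiled separately:
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>

static void show_rcvbuf_doubling(int fd)
{
	int val = 65536, out = 0;
	socklen_t len = sizeof(out);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
	printf("asked for %d, kernel reports %d\n", val, out); /* ~2 * val */
}
#endif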
 678
 679
 680int sock_getsockopt(struct socket *sock, int level, int optname,
 681                    char __user *optval, int __user *optlen)
 682{
 683        struct sock *sk = sock->sk;
 684
 685        union {
 686                int val;
 687                struct linger ling;
 688                struct timeval tm;
 689        } v;
 690
 691        unsigned int lv = sizeof(int);
 692        int len;
 693
 694        if (get_user(len, optlen))
 695                return -EFAULT;
 696        if (len < 0)
 697                return -EINVAL;
 698
 699        switch(optname) {
 700        case SO_DEBUG:
 701                v.val = sock_flag(sk, SOCK_DBG);
 702                break;
 703
 704        case SO_DONTROUTE:
 705                v.val = sock_flag(sk, SOCK_LOCALROUTE);
 706                break;
 707
 708        case SO_BROADCAST:
 709                v.val = !!sock_flag(sk, SOCK_BROADCAST);
 710                break;
 711
 712        case SO_SNDBUF:
 713                v.val = sk->sk_sndbuf;
 714                break;
 715
 716        case SO_RCVBUF:
 717                v.val = sk->sk_rcvbuf;
 718                break;
 719
 720        case SO_REUSEADDR:
 721                v.val = sk->sk_reuse;
 722                break;
 723
 724        case SO_KEEPALIVE:
 725                v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 726                break;
 727
 728        case SO_TYPE:
 729                v.val = sk->sk_type;
 730                break;
 731
 732        case SO_ERROR:
 733                v.val = -sock_error(sk);
 734                if (v.val==0)
 735                        v.val = xchg(&sk->sk_err_soft, 0);
 736                break;
 737
 738        case SO_OOBINLINE:
 739                v.val = !!sock_flag(sk, SOCK_URGINLINE);
 740                break;
 741
 742        case SO_NO_CHECK:
 743                v.val = sk->sk_no_check;
 744                break;
 745
 746        case SO_PRIORITY:
 747                v.val = sk->sk_priority;
 748                break;
 749
 750        case SO_LINGER:
 751                lv                = sizeof(v.ling);
 752                v.ling.l_onoff        = !!sock_flag(sk, SOCK_LINGER);
 753                v.ling.l_linger        = sk->sk_lingertime / HZ;
 754                break;
 755
 756        case SO_BSDCOMPAT:
 757                sock_warn_obsolete_bsdism("getsockopt");
 758                break;
 759
 760        case SO_TIMESTAMP:
 761                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 762                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
 763                break;
 764
 765        case SO_TIMESTAMPNS:
 766                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 767                break;
 768
 769        case SO_RCVTIMEO:
 770                lv=sizeof(struct timeval);
 771                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 772                        v.tm.tv_sec = 0;
 773                        v.tm.tv_usec = 0;
 774                } else {
 775                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 776                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 777                }
 778                break;
 779
 780        case SO_SNDTIMEO:
 781                lv=sizeof(struct timeval);
 782                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 783                        v.tm.tv_sec = 0;
 784                        v.tm.tv_usec = 0;
 785                } else {
 786                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 787                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 788                }
 789                break;
 790
 791        case SO_RCVLOWAT:
 792                v.val = sk->sk_rcvlowat;
 793                break;
 794
 795        case SO_SNDLOWAT:
 796                v.val=1;
 797                break;
 798
 799        case SO_PASSCRED:
 800                v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 801                break;
 802
 803        case SO_PEERCRED:
 804                if (len > sizeof(sk->sk_peercred))
 805                        len = sizeof(sk->sk_peercred);
 806                if (copy_to_user(optval, &sk->sk_peercred, len))
 807                        return -EFAULT;
 808                goto lenout;
 809
 810        case SO_PEERNAME:
 811        {
 812                char address[128];
 813
 814                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 815                        return -ENOTCONN;
 816                if (lv < len)
 817                        return -EINVAL;
 818                if (copy_to_user(optval, address, len))
 819                        return -EFAULT;
 820                goto lenout;
 821        }
 822
 823        /* Dubious BSD thing... Probably nobody even uses it, but
 824         * the UNIX standard wants it for whatever reason... -DaveM
 825         */
 826        case SO_ACCEPTCONN:
 827                v.val = sk->sk_state == TCP_LISTEN;
 828                break;
 829
 830        case SO_PASSSEC:
 831                v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 832                break;
 833
 834        case SO_PEERSEC:
 835                return security_socket_getpeersec_stream(sock, optval, optlen, len);
 836
 837        case SO_MARK:
 838                v.val = sk->sk_mark;
 839                break;
 840
 841        default:
 842                return -ENOPROTOOPT;
 843        }
 844
 845        if (len > lv)
 846                len = lv;
 847        if (copy_to_user(optval, &v, len))
 848                return -EFAULT;
 849lenout:
 850        if (put_user(len, optlen))
 851                return -EFAULT;
 852        return 0;
 853}
 854
 855/*
 856 * Initialize an sk_lock.
 857 *
 858 * (We also register the sk_lock with the lock validator.)
 859 */
 860static inline void sock_lock_init(struct sock *sk)
 861{
 862        sock_lock_init_class_and_name(sk,
 863                        af_family_slock_key_strings[sk->sk_family],
 864                        af_family_slock_keys + sk->sk_family,
 865                        af_family_key_strings[sk->sk_family],
 866                        af_family_keys + sk->sk_family);
 867}
 868
 869static void sock_copy(struct sock *nsk, const struct sock *osk)
 870{
 871#ifdef CONFIG_SECURITY_NETWORK
 872        void *sptr = nsk->sk_security;
 873#endif
 874
 875        memcpy(nsk, osk, osk->sk_prot->obj_size);
 876#ifdef CONFIG_SECURITY_NETWORK
 877        nsk->sk_security = sptr;
 878        security_sk_clone(osk, nsk);
 879#endif
 880}
 881
 882static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 883                int family)
 884{
 885        struct sock *sk;
 886        struct kmem_cache *slab;
 887
 888        slab = prot->slab;
 889        if (slab != NULL)
 890                sk = kmem_cache_alloc(slab, priority);
 891        else
 892                sk = kmalloc(prot->obj_size, priority);
 893
 894        if (sk != NULL) {
 895                if (security_sk_alloc(sk, family, priority))
 896                        goto out_free;
 897
 898                if (!try_module_get(prot->owner))
 899                        goto out_free_sec;
 900        }
 901
 902        return sk;
 903
 904out_free_sec:
 905        security_sk_free(sk);
 906out_free:
 907        if (slab != NULL)
 908                kmem_cache_free(slab, sk);
 909        else
 910                kfree(sk);
 911        return NULL;
 912}
 913
 914static void sk_prot_free(struct proto *prot, struct sock *sk)
 915{
 916        struct kmem_cache *slab;
 917        struct module *owner;
 918
 919        owner = prot->owner;
 920        slab = prot->slab;
 921
 922        security_sk_free(sk);
 923        if (slab != NULL)
 924                kmem_cache_free(slab, sk);
 925        else
 926                kfree(sk);
 927        module_put(owner);
 928}
 929
 930/**
 931 *        sk_alloc - All socket objects are allocated here
 932 *        @net: the applicable net namespace
 933 *        @family: protocol family
 934 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 935 *        @prot: struct proto associated with this new sock instance
 936 */
 937struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 938                      struct proto *prot)
 939{
 940        struct sock *sk;
 941
 942        sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
 943        if (sk) {
 944                sk->sk_family = family;
 945                /*
 946                 * See comment in struct sock definition to understand
 947                 * why we need sk_prot_creator -acme
 948                 */
 949                sk->sk_prot = sk->sk_prot_creator = prot;
 950                sock_lock_init(sk);
 951                sock_net_set(sk, get_net(net));
 952        }
 953
 954        return sk;
 955}
 956
 957void sk_free(struct sock *sk)
 958{
 959        struct sk_filter *filter;
 960
 961        if (sk->sk_destruct)
 962                sk->sk_destruct(sk);
 963
 964        filter = rcu_dereference(sk->sk_filter);
 965        if (filter) {
 966                sk_filter_uncharge(sk, filter);
 967                rcu_assign_pointer(sk->sk_filter, NULL);
 968        }
 969
 970        sock_disable_timestamp(sk);
 971
 972        if (atomic_read(&sk->sk_omem_alloc))
 973                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 974                       __func__, atomic_read(&sk->sk_omem_alloc));
 975
 976        put_net(sock_net(sk));
 977        sk_prot_free(sk->sk_prot_creator, sk);
 978}
 979
 980/*
  981 * The last sock_put should drop the reference to sk->sk_net. It has already
  982 * been dropped in sk_change_net. Taking a reference to a stopping namespace
  983 * is not an option.
  984 * Take a reference to the socket to remove it from the hash while _alive_,
  985 * and after that destroy it in the context of init_net.
 986 */
 987void sk_release_kernel(struct sock *sk)
 988{
 989        if (sk == NULL || sk->sk_socket == NULL)
 990                return;
 991
 992        sock_hold(sk);
 993        sock_release(sk->sk_socket);
 994        release_net(sock_net(sk));
 995        sock_net_set(sk, get_net(&init_net));
 996        sock_put(sk);
 997}
 998EXPORT_SYMBOL(sk_release_kernel);
 999
1000struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1001{
1002        struct sock *newsk;
1003
1004        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1005        if (newsk != NULL) {
1006                struct sk_filter *filter;
1007
1008                sock_copy(newsk, sk);
1009
1010                /* SANITY */
1011                get_net(sock_net(newsk));
1012                sk_node_init(&newsk->sk_node);
1013                sock_lock_init(newsk);
1014                bh_lock_sock(newsk);
1015                newsk->sk_backlog.head        = newsk->sk_backlog.tail = NULL;
1016
1017                atomic_set(&newsk->sk_rmem_alloc, 0);
1018                atomic_set(&newsk->sk_wmem_alloc, 0);
1019                atomic_set(&newsk->sk_omem_alloc, 0);
1020                skb_queue_head_init(&newsk->sk_receive_queue);
1021                skb_queue_head_init(&newsk->sk_write_queue);
1022#ifdef CONFIG_NET_DMA
1023                skb_queue_head_init(&newsk->sk_async_wait_queue);
1024#endif
1025
1026                rwlock_init(&newsk->sk_dst_lock);
1027                rwlock_init(&newsk->sk_callback_lock);
1028                lockdep_set_class_and_name(&newsk->sk_callback_lock,
1029                                af_callback_keys + newsk->sk_family,
1030                                af_family_clock_key_strings[newsk->sk_family]);
1031
1032                newsk->sk_dst_cache        = NULL;
1033                newsk->sk_wmem_queued        = 0;
1034                newsk->sk_forward_alloc = 0;
1035                newsk->sk_send_head        = NULL;
1036                newsk->sk_userlocks        = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1037
1038                sock_reset_flag(newsk, SOCK_DONE);
1039                skb_queue_head_init(&newsk->sk_error_queue);
1040
1041                filter = newsk->sk_filter;
1042                if (filter != NULL)
1043                        sk_filter_charge(newsk, filter);
1044
1045                if (unlikely(xfrm_sk_clone_policy(newsk))) {
1046                        /* It is still raw copy of parent, so invalidate
1047                         * destructor and make plain sk_free() */
1048                        newsk->sk_destruct = NULL;
1049                        sk_free(newsk);
1050                        newsk = NULL;
1051                        goto out;
1052                }
1053
1054                newsk->sk_err           = 0;
1055                newsk->sk_priority = 0;
1056                atomic_set(&newsk->sk_refcnt, 2);
1057
1058                /*
1059                 * Increment the counter in the same struct proto as the master
1060                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1061                 * is the same as sk->sk_prot->socks, as this field was copied
1062                 * with memcpy).
1063                 *
1064                 * This _changes_ the previous behaviour, where
1065                 * tcp_create_openreq_child always was incrementing the
1066                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1067                 * to be taken into account in all callers. -acme
1068                 */
1069                sk_refcnt_debug_inc(newsk);
1070                sk_set_socket(newsk, NULL);
1071                newsk->sk_sleep         = NULL;
1072
1073                if (newsk->sk_prot->sockets_allocated)
1074                        atomic_inc(newsk->sk_prot->sockets_allocated);
1075        }
1076out:
1077        return newsk;
1078}
1079
1080EXPORT_SYMBOL_GPL(sk_clone);
1081
1082void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1083{
1084        __sk_dst_set(sk, dst);
1085        sk->sk_route_caps = dst->dev->features;
1086        if (sk->sk_route_caps & NETIF_F_GSO)
1087                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1088        if (sk_can_gso(sk)) {
1089                if (dst->header_len) {
1090                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1091                } else {
1092                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1093                        sk->sk_gso_max_size = dst->dev->gso_max_size;
1094                }
1095        }
1096}
1097EXPORT_SYMBOL_GPL(sk_setup_caps);
1098
1099void __init sk_init(void)
1100{
1101        if (num_physpages <= 4096) {
1102                sysctl_wmem_max = 32767;
1103                sysctl_rmem_max = 32767;
1104                sysctl_wmem_default = 32767;
1105                sysctl_rmem_default = 32767;
1106        } else if (num_physpages >= 131072) {
1107                sysctl_wmem_max = 131071;
1108                sysctl_rmem_max = 131071;
1109        }
1110}
1111
1112/*
1113 *        Simple resource managers for sockets.
1114 */
1115
1116
1117/*
1118 * Write buffer destructor automatically called from kfree_skb.
1119 */
1120void sock_wfree(struct sk_buff *skb)
1121{
1122        struct sock *sk = skb->sk;
1123
1124        /* In case it might be waiting for more memory. */
1125        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1126        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1127                sk->sk_write_space(sk);
1128        sock_put(sk);
1129}
1130
1131/*
1132 * Read buffer destructor automatically called from kfree_skb.
1133 */
1134void sock_rfree(struct sk_buff *skb)
1135{
1136        struct sock *sk = skb->sk;
1137
1138        skb_truesize_check(skb);
1139        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1140        sk_mem_uncharge(skb->sk, skb->truesize);
1141}
1142
1143
1144int sock_i_uid(struct sock *sk)
1145{
1146        int uid;
1147
1148        read_lock(&sk->sk_callback_lock);
1149        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1150        read_unlock(&sk->sk_callback_lock);
1151        return uid;
1152}
1153
1154unsigned long sock_i_ino(struct sock *sk)
1155{
1156        unsigned long ino;
1157
1158        read_lock(&sk->sk_callback_lock);
1159        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1160        read_unlock(&sk->sk_callback_lock);
1161        return ino;
1162}
1163
1164/*
1165 * Allocate a skb from the socket's send buffer.
1166 */
1167struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1168                             gfp_t priority)
1169{
1170        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1171                struct sk_buff * skb = alloc_skb(size, priority);
1172                if (skb) {
1173                        skb_set_owner_w(skb, sk);
1174                        return skb;
1175                }
1176        }
1177        return NULL;
1178}
1179
1180/*
1181 * Allocate a skb from the socket's receive buffer.
1182 */
1183struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1184                             gfp_t priority)
1185{
1186        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1187                struct sk_buff *skb = alloc_skb(size, priority);
1188                if (skb) {
1189                        skb_set_owner_r(skb, sk);
1190                        return skb;
1191                }
1192        }
1193        return NULL;
1194}
1195
1196/*
1197 * Allocate a memory block from the socket's option memory buffer.
1198 */
1199void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1200{
1201        if ((unsigned)size <= sysctl_optmem_max &&
1202            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1203                void *mem;
1204                /* First do the add, to avoid the race if kmalloc
1205                 * might sleep.
1206                 */
1207                atomic_add(size, &sk->sk_omem_alloc);
1208                mem = kmalloc(size, priority);
1209                if (mem)
1210                        return mem;
1211                atomic_sub(size, &sk->sk_omem_alloc);
1212        }
1213        return NULL;
1214}
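/*
 * Editor's note, not part of sock.c: sock_kmalloc() charges sk_omem_alloc
 * before calling kmalloc() because kmalloc() may sleep; charging first keeps
 * a concurrent allocator from also passing the sysctl_optmem_max check while
 * this one is blocked. Callers free with the matching size, e.g.:
 *
 *	buf = sock_kmalloc(sk, size, GFP_KERNEL);
 *	...
 *	sock_kfree_s(sk, buf, size);
 */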
1215
1216/*
1217 * Free an option memory block.
1218 */
1219void sock_kfree_s(struct sock *sk, void *mem, int size)
1220{
1221        kfree(mem);
1222        atomic_sub(size, &sk->sk_omem_alloc);
1223}
1224
1225/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 1226   I think these locks should be removed for datagram sockets.
1227 */
1228static long sock_wait_for_wmem(struct sock * sk, long timeo)
1229{
1230        DEFINE_WAIT(wait);
1231
1232        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233        for (;;) {
1234                if (!timeo)
1235                        break;
1236                if (signal_pending(current))
1237                        break;
1238                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1239                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1240                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1241                        break;
1242                if (sk->sk_shutdown & SEND_SHUTDOWN)
1243                        break;
1244                if (sk->sk_err)
1245                        break;
1246                timeo = schedule_timeout(timeo);
1247        }
1248        finish_wait(sk->sk_sleep, &wait);
1249        return timeo;
1250}
1251
1252
1253/*
1254 *        Generic send/receive buffer handlers
1255 */
1256
1257static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1258                                            unsigned long header_len,
1259                                            unsigned long data_len,
1260                                            int noblock, int *errcode)
1261{
1262        struct sk_buff *skb;
1263        gfp_t gfp_mask;
1264        long timeo;
1265        int err;
1266
1267        gfp_mask = sk->sk_allocation;
1268        if (gfp_mask & __GFP_WAIT)
1269                gfp_mask |= __GFP_REPEAT;
1270
1271        timeo = sock_sndtimeo(sk, noblock);
1272        while (1) {
1273                err = sock_error(sk);
1274                if (err != 0)
1275                        goto failure;
1276
1277                err = -EPIPE;
1278                if (sk->sk_shutdown & SEND_SHUTDOWN)
1279                        goto failure;
1280
1281                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282                        skb = alloc_skb(header_len, gfp_mask);
1283                        if (skb) {
1284                                int npages;
1285                                int i;
1286
1287                                /* No pages, we're done... */
1288                                if (!data_len)
1289                                        break;
1290
1291                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1292                                skb->truesize += data_len;
1293                                skb_shinfo(skb)->nr_frags = npages;
1294                                for (i = 0; i < npages; i++) {
1295                                        struct page *page;
1296                                        skb_frag_t *frag;
1297
1298                                        page = alloc_pages(sk->sk_allocation, 0);
1299                                        if (!page) {
1300                                                err = -ENOBUFS;
1301                                                skb_shinfo(skb)->nr_frags = i;
1302                                                kfree_skb(skb);
1303                                                goto failure;
1304                                        }
1305
1306                                        frag = &skb_shinfo(skb)->frags[i];
1307                                        frag->page = page;
1308                                        frag->page_offset = 0;
1309                                        frag->size = (data_len >= PAGE_SIZE ?
1310                                                      PAGE_SIZE :
1311                                                      data_len);
1312                                        data_len -= PAGE_SIZE;
1313                                }
1314
1315                                /* Full success... */
1316                                break;
1317                        }
1318                        err = -ENOBUFS;
1319                        goto failure;
1320                }
1321                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1322                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1323                err = -EAGAIN;
1324                if (!timeo)
1325                        goto failure;
1326                if (signal_pending(current))
1327                        goto interrupted;
1328                timeo = sock_wait_for_wmem(sk, timeo);
1329        }
1330
1331        skb_set_owner_w(skb, sk);
1332        return skb;
1333
1334interrupted:
1335        err = sock_intr_errno(timeo);
1336failure:
1337        *errcode = err;
1338        return NULL;
1339}
1340
1341struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1342                                    int noblock, int *errcode)
1343{
1344        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1345}
1346
1347static void __lock_sock(struct sock *sk)
1348{
1349        DEFINE_WAIT(wait);
1350
1351        for (;;) {
1352                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1353                                        TASK_UNINTERRUPTIBLE);
1354                spin_unlock_bh(&sk->sk_lock.slock);
1355                schedule();
1356                spin_lock_bh(&sk->sk_lock.slock);
1357                if (!sock_owned_by_user(sk))
1358                        break;
1359        }
1360        finish_wait(&sk->sk_lock.wq, &wait);
1361}
1362
1363static void __release_sock(struct sock *sk)
1364{
1365        struct sk_buff *skb = sk->sk_backlog.head;
1366
1367        do {
1368                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1369                bh_unlock_sock(sk);
1370
1371                do {
1372                        struct sk_buff *next = skb->next;
1373
1374                        skb->next = NULL;
1375                        sk_backlog_rcv(sk, skb);
1376
1377                        /*
1378                         * We are in process context here with softirqs
1379                         * disabled, use cond_resched_softirq() to preempt.
1380                         * This is safe to do because we've taken the backlog
1381                         * queue private:
1382                         */
1383                        cond_resched_softirq();
1384
1385                        skb = next;
1386                } while (skb != NULL);
1387
1388                bh_lock_sock(sk);
1389        } while ((skb = sk->sk_backlog.head) != NULL);
1390}
1391
1392/**
1393 * sk_wait_data - wait for data to arrive at sk_receive_queue
1394 * @sk:    sock to wait on
1395 * @timeo: for how long
1396 *
1397 * Now socket state including sk->sk_err is changed only under lock,
1398 * hence we may omit checks after joining wait queue.
1399 * We check receive queue before schedule() only as optimization;
1400 * it is very likely that release_sock() added new data.
1401 */
1402int sk_wait_data(struct sock *sk, long *timeo)
1403{
1404        int rc;
1405        DEFINE_WAIT(wait);
1406
1407        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1408        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1409        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1410        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1411        finish_wait(sk->sk_sleep, &wait);
1412        return rc;
1413}
1414
1415EXPORT_SYMBOL(sk_wait_data);
1416
1417/**
1418 *        __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1419 *        @sk: socket
1420 *        @size: memory size to allocate
1421 *        @kind: allocation type
1422 *
1423 *        If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1424 *        rmem allocation. This function assumes that protocols which have
1425 *        memory_pressure use sk_wmem_queued as write buffer accounting.
1426 */
1427int __sk_mem_schedule(struct sock *sk, int size, int kind)
1428{
1429        struct proto *prot = sk->sk_prot;
1430        int amt = sk_mem_pages(size);
1431        int allocated;
1432
1433        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1434        allocated = atomic_add_return(amt, prot->memory_allocated);
1435
1436        /* Under limit. */
1437        if (allocated <= prot->sysctl_mem[0]) {
1438                if (prot->memory_pressure && *prot->memory_pressure)
1439                        *prot->memory_pressure = 0;
1440                return 1;
1441        }
1442
1443        /* Under pressure. */
1444        if (allocated > prot->sysctl_mem[1])
1445                if (prot->enter_memory_pressure)
1446                        prot->enter_memory_pressure(sk);
1447
1448        /* Over hard limit. */
1449        if (allocated > prot->sysctl_mem[2])
1450                goto suppress_allocation;
1451
1452        /* guarantee minimum buffer size under pressure */
1453        if (kind == SK_MEM_RECV) {
1454                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1455                        return 1;
1456        } else { /* SK_MEM_SEND */
1457                if (sk->sk_type == SOCK_STREAM) {
1458                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1459                                return 1;
1460                } else if (atomic_read(&sk->sk_wmem_alloc) <
1461                           prot->sysctl_wmem[0])
1462                                return 1;
1463        }
1464
1465        if (prot->memory_pressure) {
1466                if (!*prot->memory_pressure ||
1467                    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
1468                    sk_mem_pages(sk->sk_wmem_queued +
1469                                 atomic_read(&sk->sk_rmem_alloc) +
1470                                 sk->sk_forward_alloc))
1471                        return 1;
1472        }
1473
1474suppress_allocation:
1475
1476        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1477                sk_stream_moderate_sndbuf(sk);
1478
1479        /* Fail only if the socket is still _under_ its sndbuf.
1480         * In this case we cannot block, so we have to fail.
1481         */
1482                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1483                        return 1;
1484        }
1485
1486        /* Alas. Undo changes. */
1487        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1488        atomic_sub(amt, prot->memory_allocated);
1489        return 0;
1490}
1491
1492EXPORT_SYMBOL(__sk_mem_schedule);
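For reference, the inline wrappers in include/net/sock.h that gate entry
into __sk_mem_schedule() look roughly like this; protocols without memory
accounting (a NULL memory_allocated pointer) bypass the charge entirely:

static inline int sk_has_account(struct sock *sk)
{
        /* A non-NULL memory_allocated means the protocol does accounting. */
        return !!sk->sk_prot->memory_allocated;
}

static inline int sk_wmem_schedule(struct sock *sk, int size)
{
        if (!sk_has_account(sk))
                return 1;
        return size <= sk->sk_forward_alloc ||
                __sk_mem_schedule(sk, size, SK_MEM_SEND);
}

static inline int sk_rmem_schedule(struct sock *sk, int size)
{
        if (!sk_has_account(sk))
                return 1;
        return size <= sk->sk_forward_alloc ||
                __sk_mem_schedule(sk, size, SK_MEM_RECV);
}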
1493
1494/**
1495 *        __sk_mem_reclaim - reclaim memory_allocated
1496 *        @sk: socket
1497 */
1498void __sk_mem_reclaim(struct sock *sk)
1499{
1500        struct proto *prot = sk->sk_prot;
1501
1502        atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1503                   prot->memory_allocated);
1504        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1505
1506        if (prot->memory_pressure && *prot->memory_pressure &&
1507            (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1508                *prot->memory_pressure = 0;
1509}
1510
1511EXPORT_SYMBOL(__sk_mem_reclaim);
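Its inline counterpart only calls down here once at least one full quantum
can be returned to the global pool; roughly:

static inline void sk_mem_reclaim(struct sock *sk)
{
        if (!sk_has_account(sk))
                return;
        if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk);
}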
1512
1513
1514/*
1515 * Set of default routines for initialising struct proto_ops when
1516 * the protocol does not support a particular function. In certain
1517 * cases where it makes no sense for a protocol to have a "do nothing"
1518 * function, some default processing is provided.
1519 */
1520
1521int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1522{
1523        return -EOPNOTSUPP;
1524}
1525
1526int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1527                    int len, int flags)
1528{
1529        return -EOPNOTSUPP;
1530}
1531
1532int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1533{
1534        return -EOPNOTSUPP;
1535}
1536
1537int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1538{
1539        return -EOPNOTSUPP;
1540}
1541
1542int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1543                    int *len, int peer)
1544{
1545        return -EOPNOTSUPP;
1546}
1547
1548unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1549{
1550        return 0;
1551}
1552
1553int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1554{
1555        return -EOPNOTSUPP;
1556}
1557
1558int sock_no_listen(struct socket *sock, int backlog)
1559{
1560        return -EOPNOTSUPP;
1561}
1562
1563int sock_no_shutdown(struct socket *sock, int how)
1564{
1565        return -EOPNOTSUPP;
1566}
1567
1568int sock_no_setsockopt(struct socket *sock, int level, int optname,
1569                    char __user *optval, int optlen)
1570{
1571        return -EOPNOTSUPP;
1572}
1573
1574int sock_no_getsockopt(struct socket *sock, int level, int optname,
1575                    char __user *optval, int __user *optlen)
1576{
1577        return -EOPNOTSUPP;
1578}
1579
1580int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1581                    size_t len)
1582{
1583        return -EOPNOTSUPP;
1584}
1585
1586int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1587                    size_t len, int flags)
1588{
1589        return -EOPNOTSUPP;
1590}
1591
1592int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1593{
1594        /* Mirror missing mmap method error code */
1595        return -ENODEV;
1596}
1597
1598ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1599{
1600        ssize_t res;
1601        struct msghdr msg = {.msg_flags = flags};
1602        struct kvec iov;
1603        char *kaddr = kmap(page);
1604        iov.iov_base = kaddr + offset;
1605        iov.iov_len = size;
1606        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1607        kunmap(page);
1608        return res;
1609}
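A protocol that only supports datagram-style send/receive can fill the
remaining proto_ops slots with these defaults. A sketch with hypothetical
my_* handlers (the shape follows tables such as AF_PACKET's):

static const struct proto_ops my_dgram_ops = {
        .family         = PF_PACKET,            /* illustrative only */
        .owner          = THIS_MODULE,
        .release        = my_release,           /* hypothetical */
        .bind           = my_bind,              /* hypothetical */
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = my_getname,           /* hypothetical */
        .poll           = datagram_poll,
        .ioctl          = my_ioctl,             /* hypothetical */
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .sendmsg        = my_sendmsg,           /* hypothetical */
        .recvmsg        = my_recvmsg,           /* hypothetical */
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};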
1610
1611/*
1612 *        Default Socket Callbacks
1613 */
1614
1615static void sock_def_wakeup(struct sock *sk)
1616{
1617        read_lock(&sk->sk_callback_lock);
1618        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1619                wake_up_interruptible_all(sk->sk_sleep);
1620        read_unlock(&sk->sk_callback_lock);
1621}
1622
1623static void sock_def_error_report(struct sock *sk)
1624{
1625        read_lock(&sk->sk_callback_lock);
1626        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627                wake_up_interruptible(sk->sk_sleep);
1628        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1629        read_unlock(&sk->sk_callback_lock);
1630}
1631
1632static void sock_def_readable(struct sock *sk, int len)
1633{
1634        read_lock(&sk->sk_callback_lock);
1635        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1636                wake_up_interruptible_sync(sk->sk_sleep);
1637        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1638        read_unlock(&sk->sk_callback_lock);
1639}
1640
1641static void sock_def_write_space(struct sock *sk)
1642{
1643        read_lock(&sk->sk_callback_lock);
1644
1645        /* Do not wake up a writer until he can make "significant"
1646         * progress.  --DaveM
1647         */
1648        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1649                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1650                        wake_up_interruptible_sync(sk->sk_sleep);
1651
1652                /* Should agree with poll, otherwise some programs break */
1653                if (sock_writeable(sk))
1654                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1655        }
1656
1657        read_unlock(&sk->sk_callback_lock);
1658}
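The "agree with poll" remark refers to sock_writeable(), whose test in
include/net/sock.h is essentially the same half-empty-sndbuf condition used
in the wakeup check above:

static inline int sock_writeable(const struct sock *sk)
{
        return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
}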
1659
1660static void sock_def_destruct(struct sock *sk)
1661{
1662        kfree(sk->sk_protinfo);
1663}
1664
1665void sk_send_sigurg(struct sock *sk)
1666{
1667        if (sk->sk_socket && sk->sk_socket->file)
1668                if (send_sigurg(&sk->sk_socket->file->f_owner))
1669                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1670}
1671
1672void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1673                    unsigned long expires)
1674{
1675        if (!mod_timer(timer, expires))
1676                sock_hold(sk);
1677}
1678
1679EXPORT_SYMBOL(sk_reset_timer);
1680
1681void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1682{
1683        if (timer_pending(timer) && del_timer(timer))
1684                __sock_put(sk);
1685}
1686
1687EXPORT_SYMBOL(sk_stop_timer);
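These two keep the timer's implicit socket reference balanced:
sk_reset_timer() takes a hold only when the timer was not already pending,
so the handler must drop one reference when it fires, and sk_stop_timer()
drops it when a pending timer is cancelled. A sketch with a hypothetical
handler:

static void my_retransmit_timer(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        /* ... protocol work, possibly re-arming via sk_reset_timer() ... */

        sock_put(sk);   /* pairs with the hold taken by sk_reset_timer() */
}

static void my_arm_timer(struct sock *sk)
{
        /* sk->sk_timer is assumed set up with my_retransmit_timer. */
        sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}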
1688
1689void sock_init_data(struct socket *sock, struct sock *sk)
1690{
1691        skb_queue_head_init(&sk->sk_receive_queue);
1692        skb_queue_head_init(&sk->sk_write_queue);
1693        skb_queue_head_init(&sk->sk_error_queue);
1694#ifdef CONFIG_NET_DMA
1695        skb_queue_head_init(&sk->sk_async_wait_queue);
1696#endif
1697
1698        sk->sk_send_head        =        NULL;
1699
1700        init_timer(&sk->sk_timer);
1701
1702        sk->sk_allocation        =        GFP_KERNEL;
1703        sk->sk_rcvbuf                =        sysctl_rmem_default;
1704        sk->sk_sndbuf                =        sysctl_wmem_default;
1705        sk->sk_state                =        TCP_CLOSE;
1706        sk_set_socket(sk, sock);
1707
1708        sock_set_flag(sk, SOCK_ZAPPED);
1709
1710        if (sock) {
1711                sk->sk_type        =        sock->type;
1712                sk->sk_sleep        =        &sock->wait;
1713                sock->sk        =        sk;
1714        } else
1715                sk->sk_sleep        =        NULL;
1716
1717        rwlock_init(&sk->sk_dst_lock);
1718        rwlock_init(&sk->sk_callback_lock);
1719        lockdep_set_class_and_name(&sk->sk_callback_lock,
1720                        af_callback_keys + sk->sk_family,
1721                        af_family_clock_key_strings[sk->sk_family]);
1722
1723        sk->sk_state_change        =        sock_def_wakeup;
1724        sk->sk_data_ready        =        sock_def_readable;
1725        sk->sk_write_space        =        sock_def_write_space;
1726        sk->sk_error_report        =        sock_def_error_report;
1727        sk->sk_destruct                =        sock_def_destruct;
1728
1729        sk->sk_sndmsg_page        =        NULL;
1730        sk->sk_sndmsg_off        =        0;
1731
1732        sk->sk_peercred.pid         =        0;
1733        sk->sk_peercred.uid        =        -1;
1734        sk->sk_peercred.gid        =        -1;
1735        sk->sk_write_pending        =        0;
1736        sk->sk_rcvlowat                =        1;
1737        sk->sk_rcvtimeo                =        MAX_SCHEDULE_TIMEOUT;
1738        sk->sk_sndtimeo                =        MAX_SCHEDULE_TIMEOUT;
1739
1740        sk->sk_stamp = ktime_set(-1L, 0);
1741
1742        atomic_set(&sk->sk_refcnt, 1);
1743        atomic_set(&sk->sk_drops, 0);
1744}
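sock_init_data() is normally the second step of an address family's
create() hook, right after sk_alloc(). A minimal sketch with a hypothetical
my_proto and the ops table sketched earlier (modeled on af_packet's create
path):

static int my_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;

        sock->ops = &my_dgram_ops;      /* hypothetical ops table */

        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &my_proto);
        if (sk == NULL)
                return -ENOBUFS;

        sock_init_data(sock, sk);       /* queues, callbacks, defaults */
        sk->sk_protocol = protocol;
        return 0;
}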
1745
1746void lock_sock_nested(struct sock *sk, int subclass)
1747{
1748        might_sleep();
1749        spin_lock_bh(&sk->sk_lock.slock);
1750        if (sk->sk_lock.owned)
1751                __lock_sock(sk);
1752        sk->sk_lock.owned = 1;
1753        spin_unlock(&sk->sk_lock.slock);
1754        /*
1755         * The sk_lock has mutex_lock() semantics here:
1756         */
1757        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1758        local_bh_enable();
1759}
1760
1761EXPORT_SYMBOL(lock_sock_nested);
1762
1763void release_sock(struct sock *sk)
1764{
1765        /*
1766         * The sk_lock has mutex_unlock() semantics:
1767         */
1768        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1769
1770        spin_lock_bh(&sk->sk_lock.slock);
1771        if (sk->sk_backlog.tail)
1772                __release_sock(sk);
1773        sk->sk_lock.owned = 0;
1774        if (waitqueue_active(&sk->sk_lock.wq))
1775                wake_up(&sk->sk_lock.wq);
1776        spin_unlock_bh(&sk->sk_lock.slock);
1777}
1778EXPORT_SYMBOL(release_sock);
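The canonical pairing: user-context code takes the "owned" lock, mutates
socket state that softirq receive paths may also touch, and release_sock()
then flushes whatever was backlogged in the meantime. A sketch:

static void my_set_rcvlowat(struct sock *sk, int val)
{
        lock_sock(sk);                  /* lock_sock_nested(sk, 0) */
        sk->sk_rcvlowat = val ? : 1;    /* same idiom as sock_setsockopt() */
        release_sock(sk);               /* also processes the backlog */
}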
1779
1780int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1781{
1782        struct timeval tv;
1783        if (!sock_flag(sk, SOCK_TIMESTAMP))
1784                sock_enable_timestamp(sk);
1785        tv = ktime_to_timeval(sk->sk_stamp);
1786        if (tv.tv_sec == -1)
1787                return -ENOENT;
1788        if (tv.tv_sec == 0) {
1789                sk->sk_stamp = ktime_get_real();
1790                tv = ktime_to_timeval(sk->sk_stamp);
1791        }
1792        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1793}
1794EXPORT_SYMBOL(sock_get_timestamp);
1795
1796int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1797{
1798        struct timespec ts;
1799        if (!sock_flag(sk, SOCK_TIMESTAMP))
1800                sock_enable_timestamp(sk);
1801        ts = ktime_to_timespec(sk->sk_stamp);
1802        if (ts.tv_sec == -1)
1803                return -ENOENT;
1804        if (ts.tv_sec == 0) {
1805                sk->sk_stamp = ktime_get_real();
1806                ts = ktime_to_timespec(sk->sk_stamp);
1807        }
1808        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1809}
1810EXPORT_SYMBOL(sock_get_timestampns);
1811
1812void sock_enable_timestamp(struct sock *sk)
1813{
1814        if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1815                sock_set_flag(sk, SOCK_TIMESTAMP);
1816                net_enable_timestamp();
1817        }
1818}
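From userspace, sock_get_timestamp() is reached through the SIOCGSTAMP
ioctl; a hedged sketch (note the call fails with errno ENOENT until a
packet has actually been timestamped):

#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>

static int last_packet_time(int fd, struct timeval *tv)
{
        return ioctl(fd, SIOCGSTAMP, tv);       /* -1/errno on failure */
}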
1819
1820/*
1821 *        Get a socket option on a socket.
1822 *
1823 *        FIX: POSIX 1003.1g is very ambiguous here. It states that
1824 *        asynchronous errors should be reported by getsockopt. We assume
1825 *        this means if you specify SO_ERROR (otherwise what's the point of it?).
1826 */
1827int sock_common_getsockopt(struct socket *sock, int level, int optname,
1828                           char __user *optval, int __user *optlen)
1829{
1830        struct sock *sk = sock->sk;
1831
1832        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1833}
1834
1835EXPORT_SYMBOL(sock_common_getsockopt);
1836
1837#ifdef CONFIG_COMPAT
1838int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1839                                  char __user *optval, int __user *optlen)
1840{
1841        struct sock *sk = sock->sk;
1842
1843        if (sk->sk_prot->compat_getsockopt != NULL)
1844                return sk->sk_prot->compat_getsockopt(sk, level, optname,
1845                                                      optval, optlen);
1846        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1847}
1848EXPORT_SYMBOL(compat_sock_common_getsockopt);
1849#endif
1850
1851int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1852                        struct msghdr *msg, size_t size, int flags)
1853{
1854        struct sock *sk = sock->sk;
1855        int addr_len = 0;
1856        int err;
1857
1858        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1859                                   flags & ~MSG_DONTWAIT, &addr_len);
1860        if (err >= 0)
1861                msg->msg_namelen = addr_len;
1862        return err;
1863}
1864
1865EXPORT_SYMBOL(sock_common_recvmsg);
1866
1867/*
1868 *        Set socket options on an inet socket.
1869 */
1870int sock_common_setsockopt(struct socket *sock, int level, int optname,
1871                           char __user *optval, int optlen)
1872{
1873        struct sock *sk = sock->sk;
1874
1875        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1876}
1877
1878EXPORT_SYMBOL(sock_common_setsockopt);
1879
1880#ifdef CONFIG_COMPAT
1881int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1882                                  char __user *optval, int optlen)
1883{
1884        struct sock *sk = sock->sk;
1885
1886        if (sk->sk_prot->compat_setsockopt != NULL)
1887                return sk->sk_prot->compat_setsockopt(sk, level, optname,
1888                                                      optval, optlen);
1889        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1890}
1891EXPORT_SYMBOL(compat_sock_common_setsockopt);
1892#endif
1893
1894void sk_common_release(struct sock *sk)
1895{
1896        if (sk->sk_prot->destroy)
1897                sk->sk_prot->destroy(sk);
1898
1899        /*
1900         * Observation: when sk_common_release() is called, processes have
1901         * no access to the socket, but the network stack still does.
1902         * Step one, detach it from networking:
1903         *
1904         * A. Remove it from the hash tables.
1905         */
1906
1907        sk->sk_prot->unhash(sk);
1908
1909        /*
1910         * At this point the socket cannot receive new packets, but it is possible
1911         * that some packets are in flight because another CPU runs the receiver
1912         * and did the hash table lookup before we unhashed the socket. They will
1913         * reach the receive queue and be purged by the socket destructor.
1914         *
1915         * Also, we still have packets pending on the receive queue and, probably,
1916         * our own packets waiting in device queues. The destructor will drain the
1917         * receive queue, but transmitted packets will delay socket destruction
1918         * until the last reference is released.
1919         */
1920
1921        sock_orphan(sk);
1922
1923        xfrm_sk_free_policy(sk);
1924
1925        sk_refcnt_debug_release(sk);
1926        sock_put(sk);
1927}
1928
1929EXPORT_SYMBOL(sk_common_release);
1930
1931static DEFINE_RWLOCK(proto_list_lock);
1932static LIST_HEAD(proto_list);
1933
1934#ifdef CONFIG_PROC_FS
1935#define PROTO_INUSE_NR        64        /* should be enough for the first time */
1936struct prot_inuse {
1937        int val[PROTO_INUSE_NR];
1938};
1939
1940static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1941
1942#ifdef CONFIG_NET_NS
1943void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1944{
1945        int cpu = smp_processor_id();
1946        per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1947}
1948EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1949
1950int sock_prot_inuse_get(struct net *net, struct proto *prot)
1951{
1952        int cpu, idx = prot->inuse_idx;
1953        int res = 0;
1954
1955        for_each_possible_cpu(cpu)
1956                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1957
1958        return res >= 0 ? res : 0;
1959}
1960EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1961
1962static int sock_inuse_init_net(struct net *net)
1963{
1964        net->core.inuse = alloc_percpu(struct prot_inuse);
1965        return net->core.inuse ? 0 : -ENOMEM;
1966}
1967
1968static void sock_inuse_exit_net(struct net *net)
1969{
1970        free_percpu(net->core.inuse);
1971}
1972
1973static struct pernet_operations net_inuse_ops = {
1974        .init = sock_inuse_init_net,
1975        .exit = sock_inuse_exit_net,
1976};
1977
1978static __init int net_inuse_init(void)
1979{
1980        if (register_pernet_subsys(&net_inuse_ops))
1981                panic("Cannot initialize net inuse counters");
1982
1983        return 0;
1984}
1985
1986core_initcall(net_inuse_init);
1987#else
1988static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1989
1990void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1991{
1992        __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
1993}
1994EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1995
1996int sock_prot_inuse_get(struct net *net, struct proto *prot)
1997{
1998        int cpu, idx = prot->inuse_idx;
1999        int res = 0;
2000
2001        for_each_possible_cpu(cpu)
2002                res += per_cpu(prot_inuse, cpu).val[idx];
2003
2004        return res >= 0 ? res : 0;
2005}
2006EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2007#endif
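Protocols feed these counters from their hash/unhash callbacks, where the
surrounding lock already pins the CPU (the bare smp_processor_id() above
relies on that). A sketch following the inet_hash()/inet_unhash() pattern:

static void my_hash(struct sock *sk)
{
        /* ... insert sk into the lookup structure under its lock ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void my_unhash(struct sock *sk)
{
        /* ... remove sk under the same lock ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}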
2008
2009static void assign_proto_idx(struct proto *prot)
2010{
2011        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2012
2013        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2014                printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2015                return;
2016        }
2017
2018        set_bit(prot->inuse_idx, proto_inuse_idx);
2019}
2020
2021static void release_proto_idx(struct proto *prot)
2022{
2023        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2024                clear_bit(prot->inuse_idx, proto_inuse_idx);
2025}
2026#else
2027static inline void assign_proto_idx(struct proto *prot)
2028{
2029}
2030
2031static inline void release_proto_idx(struct proto *prot)
2032{
2033}
2034#endif
2035
2036int proto_register(struct proto *prot, int alloc_slab)
2037{
2038        if (alloc_slab) {
2039                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2040                                               SLAB_HWCACHE_ALIGN, NULL);
2041
2042                if (prot->slab == NULL) {
2043                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2044                               prot->name);
2045                        goto out;
2046                }
2047
2048                if (prot->rsk_prot != NULL) {
2049                        static const char mask[] = "request_sock_%s";
2050
2051                        prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2052                        if (prot->rsk_prot->slab_name == NULL)
2053                                goto out_free_sock_slab;
2054
2055                        sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2056                        prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2057                                                                 prot->rsk_prot->obj_size, 0,
2058                                                                 SLAB_HWCACHE_ALIGN, NULL);
2059
2060                        if (prot->rsk_prot->slab == NULL) {
2061                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2062                                       prot->name);
2063                                goto out_free_request_sock_slab_name;
2064                        }
2065                }
2066
2067                if (prot->twsk_prot != NULL) {
2068                        static const char mask[] = "tw_sock_%s";
2069
2070                        prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2071
2072                        if (prot->twsk_prot->twsk_slab_name == NULL)
2073                                goto out_free_request_sock_slab;
2074
2075                        sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2076                        prot->twsk_prot->twsk_slab =
2077                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2078                                                  prot->twsk_prot->twsk_obj_size,
2079                                                  0, SLAB_HWCACHE_ALIGN,
2080                                                  NULL);
2081                        if (prot->twsk_prot->twsk_slab == NULL)
2082                                goto out_free_timewait_sock_slab_name;
2083                }
2084        }
2085
2086        write_lock(&proto_list_lock);
2087        list_add(&prot->node, &proto_list);
2088        assign_proto_idx(prot);
2089        write_unlock(&proto_list_lock);
2090        return 0;
2091
2092out_free_timewait_sock_slab_name:
2093        kfree(prot->twsk_prot->twsk_slab_name);
2094out_free_request_sock_slab:
2095        if (prot->rsk_prot && prot->rsk_prot->slab) {
2096                kmem_cache_destroy(prot->rsk_prot->slab);
2097                prot->rsk_prot->slab = NULL;
2098        }
2099out_free_request_sock_slab_name:
2100        kfree(prot->rsk_prot->slab_name);
2101out_free_sock_slab:
2102        kmem_cache_destroy(prot->slab);
2103        prot->slab = NULL;
2104out:
2105        return -ENOBUFS;
2106}
2107
2108EXPORT_SYMBOL(proto_register);
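The unwind above is where the invalid dereference flagged by this report
lives: when the protocol has no request socks (prot->rsk_prot == NULL) and
the tw_sock slab creation fails, control falls from
out_free_timewait_sock_slab_name through out_free_request_sock_slab into
out_free_request_sock_slab_name, where kfree(prot->rsk_prot->slab_name) at
line 2100 dereferences the NULL rsk_prot. A minimal sketch of a guarded
unwind that closes the hole (the upstream fix takes this shape, though not
necessarily verbatim):

out_free_timewait_sock_slab_name:
        kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
        if (prot->rsk_prot && prot->rsk_prot->slab) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                prot->rsk_prot->slab = NULL;
        }
out_free_request_sock_slab_name:
        if (prot->rsk_prot)             /* NULL when reached by fall-through */
                kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
out:
        return -ENOBUFS;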
2109
2110void proto_unregister(struct proto *prot)
2111{
2112        write_lock(&proto_list_lock);
2113        release_proto_idx(prot);
2114        list_del(&prot->node);
2115        write_unlock(&proto_list_lock);
2116
2117        if (prot->slab != NULL) {
2118                kmem_cache_destroy(prot->slab);
2119                prot->slab = NULL;
2120        }
2121
2122        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2123                kmem_cache_destroy(prot->rsk_prot->slab);
2124                kfree(prot->rsk_prot->slab_name);
2125                prot->rsk_prot->slab = NULL;
2126        }
2127
2128        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2129                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2130                kfree(prot->twsk_prot->twsk_slab_name);
2131                prot->twsk_prot->twsk_slab = NULL;
2132        }
2133}
2134
2135EXPORT_SYMBOL(proto_unregister);
2136
2137#ifdef CONFIG_PROC_FS
2138static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2139        __acquires(proto_list_lock)
2140{
2141        read_lock(&proto_list_lock);
2142        return seq_list_start_head(&proto_list, *pos);
2143}
2144
2145static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2146{
2147        return seq_list_next(v, &proto_list, pos);
2148}
2149
2150static void proto_seq_stop(struct seq_file *seq, void *v)
2151        __releases(proto_list_lock)
2152{
2153        read_unlock(&proto_list_lock);
2154}
2155
2156static char proto_method_implemented(const void *method)
2157{
2158        return method == NULL ? 'n' : 'y';
2159}
2160
2161static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2162{
2163        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2164                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2165                   proto->name,
2166                   proto->obj_size,
2167                   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2168                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2169                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2170                   proto->max_header,
2171                   proto->slab == NULL ? "no" : "yes",
2172                   module_name(proto->owner),
2173                   proto_method_implemented(proto->close),
2174                   proto_method_implemented(proto->connect),
2175                   proto_method_implemented(proto->disconnect),
2176                   proto_method_implemented(proto->accept),
2177                   proto_method_implemented(proto->ioctl),
2178                   proto_method_implemented(proto->init),
2179                   proto_method_implemented(proto->destroy),
2180                   proto_method_implemented(proto->shutdown),
2181                   proto_method_implemented(proto->setsockopt),
2182                   proto_method_implemented(proto->getsockopt),
2183                   proto_method_implemented(proto->sendmsg),
2184                   proto_method_implemented(proto->recvmsg),
2185                   proto_method_implemented(proto->sendpage),
2186                   proto_method_implemented(proto->bind),
2187                   proto_method_implemented(proto->backlog_rcv),
2188                   proto_method_implemented(proto->hash),
2189                   proto_method_implemented(proto->unhash),
2190                   proto_method_implemented(proto->get_port),
2191                   proto_method_implemented(proto->enter_memory_pressure));
2192}
2193
2194static int proto_seq_show(struct seq_file *seq, void *v)
2195{
2196        if (v == &proto_list)
2197                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2198                           "protocol",
2199                           "size",
2200                           "sockets",
2201                           "memory",
2202                           "press",
2203                           "maxhdr",
2204                           "slab",
2205                           "module",
2206                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2207        else
2208                proto_seq_printf(seq, list_entry(v, struct proto, node));
2209        return 0;
2210}
2211
2212static const struct seq_operations proto_seq_ops = {
2213        .start  = proto_seq_start,
2214        .next   = proto_seq_next,
2215        .stop   = proto_seq_stop,
2216        .show   = proto_seq_show,
2217};
2218
2219static int proto_seq_open(struct inode *inode, struct file *file)
2220{
2221        return seq_open(file, &proto_seq_ops);
2222}
2223
2224static const struct file_operations proto_seq_fops = {
2225        .owner                = THIS_MODULE,
2226        .open                = proto_seq_open,
2227        .read                = seq_read,
2228        .llseek                = seq_lseek,
2229        .release        = seq_release,
2230};
2231
2232static int __init proto_init(void)
2233{
2234        /* register /proc/net/protocols */
2235        return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2236}
2237
2238subsys_initcall(proto_init);
2239
2240#endif /* PROC_FS */
2241
2242EXPORT_SYMBOL(sk_alloc);
2243EXPORT_SYMBOL(sk_free);
2244EXPORT_SYMBOL(sk_send_sigurg);
2245EXPORT_SYMBOL(sock_alloc_send_skb);
2246EXPORT_SYMBOL(sock_init_data);
2247EXPORT_SYMBOL(sock_kfree_s);
2248EXPORT_SYMBOL(sock_kmalloc);
2249EXPORT_SYMBOL(sock_no_accept);
2250EXPORT_SYMBOL(sock_no_bind);
2251EXPORT_SYMBOL(sock_no_connect);
2252EXPORT_SYMBOL(sock_no_getname);
2253EXPORT_SYMBOL(sock_no_getsockopt);
2254EXPORT_SYMBOL(sock_no_ioctl);
2255EXPORT_SYMBOL(sock_no_listen);
2256EXPORT_SYMBOL(sock_no_mmap);
2257EXPORT_SYMBOL(sock_no_poll);
2258EXPORT_SYMBOL(sock_no_recvmsg);
2259EXPORT_SYMBOL(sock_no_sendmsg);
2260EXPORT_SYMBOL(sock_no_sendpage);
2261EXPORT_SYMBOL(sock_no_setsockopt);
2262EXPORT_SYMBOL(sock_no_shutdown);
2263EXPORT_SYMBOL(sock_no_socketpair);
2264EXPORT_SYMBOL(sock_rfree);
2265EXPORT_SYMBOL(sock_setsockopt);
2266EXPORT_SYMBOL(sock_wfree);
2267EXPORT_SYMBOL(sock_wmalloc);
2268EXPORT_SYMBOL(sock_i_uid);
2269EXPORT_SYMBOL(sock_i_ino);
2270EXPORT_SYMBOL(sysctl_optmem_max);