Showing error 1482

User: Jiri Slaby
Error type: Leaving function in locked state
Error type description: A lock acquired inside a function is not released on every exit path, so the function can return with the lock still held (leaked); a minimal sketch of this pattern appears below, before the source listing
File location: net/ipv4/tcp_ipv4.c
Line in file: 2003
Project: Linux Kernel
Project version: 2.6.28
Tools: Stanse (1.2)
Entered: 2012-05-21 20:30:05 UTC
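
The sketch below is an illustration only; it is not the code flagged at line 2003 (which lies outside the excerpt), and every identifier in it is hypothetical. It shows the error class Stanse reports: a lock taken at the top of a function is released on the normal return, but an early-exit path leaves the function with the lock still held.

#include <linux/spinlock.h>
#include <linux/errno.h>

static DEFINE_SPINLOCK(example_lock);

static int example_process(int value)
{
        spin_lock_bh(&example_lock);

        if (value < 0)
                return -EINVAL;         /* BUG: returns with example_lock still held */

        /* ... work carried out under the lock ... */

        spin_unlock_bh(&example_lock);
        return 0;
}

/* One fixed variant: route every exit through a single unlock. */
static int example_process_fixed(int value)
{
        int err = 0;

        spin_lock_bh(&example_lock);
        if (value < 0)
                err = -EINVAL;
        else {
                /* ... work carried out under the lock ... */
        }
        spin_unlock_bh(&example_lock);
        return err;
}

In kernel code the usual idiom is a common "goto unlock" / "goto out" label at the end of the function rather than duplicating the unlock on each path.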


Source:

   1/*
   2 * INET                An implementation of the TCP/IP protocol suite for the LINUX
   3 *                operating system.  INET is implemented using the  BSD Socket
   4 *                interface as the means of communication with the user level.
   5 *
   6 *                Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *                IPv4 specific functions
   9 *
  10 *
  11 *                code split from:
  12 *                linux/ipv4/tcp.c
  13 *                linux/ipv4/tcp_input.c
  14 *                linux/ipv4/tcp_output.c
  15 *
  16 *                See tcp.c for author information
  17 *
  18 *        This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *                David S. Miller        :        New socket lookup architecture.
  27 *                                        This code is dedicated to John Dyson.
  28 *                David S. Miller :        Change semantics of established hash,
  29 *                                        half is devoted to TIME_WAIT sockets
  30 *                                        and the rest go in the other half.
  31 *                Andi Kleen :                Add support for syncookies and fixed
  32 *                                        some bugs: ip options weren't passed to
  33 *                                        the TCP layer, missed a check for an
  34 *                                        ACK bit.
  35 *                Andi Kleen :                Implemented fast path mtu discovery.
  36 *                                             Fixed many serious bugs in the
  37 *                                        request_sock handling and moved
  38 *                                        most of it into the af independent code.
  39 *                                        Added tail drop and some other bugfixes.
  40 *                                        Added new listen semantics.
  41 *                Mike McLagan        :        Routing by source
  42 *        Juan Jose Ciarlante:                ip_dynaddr bits
  43 *                Andi Kleen:                various fixes.
  44 *        Vitaly E. Lavrov        :        Transparent proxy revived after year
  45 *                                        coma.
  46 *        Andi Kleen                :        Fix new listen.
  47 *        Andi Kleen                :        Fix accept error reporting.
  48 *        YOSHIFUJI Hideaki @USAGI and:        Support IPV6_V6ONLY socket option, which
  49 *        Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                        a single port at the same time.
  51 */
  52
  53
  54#include <linux/types.h>
  55#include <linux/fcntl.h>
  56#include <linux/module.h>
  57#include <linux/random.h>
  58#include <linux/cache.h>
  59#include <linux/jhash.h>
  60#include <linux/init.h>
  61#include <linux/times.h>
  62
  63#include <net/net_namespace.h>
  64#include <net/icmp.h>
  65#include <net/inet_hashtables.h>
  66#include <net/tcp.h>
  67#include <net/transp_v6.h>
  68#include <net/ipv6.h>
  69#include <net/inet_common.h>
  70#include <net/timewait_sock.h>
  71#include <net/xfrm.h>
  72#include <net/netdma.h>
  73
  74#include <linux/inet.h>
  75#include <linux/ipv6.h>
  76#include <linux/stddef.h>
  77#include <linux/proc_fs.h>
  78#include <linux/seq_file.h>
  79
  80#include <linux/crypto.h>
  81#include <linux/scatterlist.h>
  82
  83int sysctl_tcp_tw_reuse __read_mostly;
  84int sysctl_tcp_low_latency __read_mostly;
  85
  86
  87#ifdef CONFIG_TCP_MD5SIG
  88static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  89                                                   __be32 addr);
  90static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  91                               __be32 daddr, __be32 saddr, struct tcphdr *th);
  92#else
  93static inline
  94struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  95{
  96        return NULL;
  97}
  98#endif
  99
 100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 101        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
 102        .lhash_users = ATOMIC_INIT(0),
 103        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
 104};
 105
 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 107{
 108        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 109                                          ip_hdr(skb)->saddr,
 110                                          tcp_hdr(skb)->dest,
 111                                          tcp_hdr(skb)->source);
 112}
 113
 114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 115{
 116        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 117        struct tcp_sock *tp = tcp_sk(sk);
 118
 119        /* With PAWS, it is safe from the viewpoint
 120           of data integrity. Even without PAWS it is safe provided sequence
 121           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 122
 123           Actually, the idea is close to VJ's one, only timestamp cache is
 124           held not per host, but per port pair and TW bucket is used as state
 125           holder.
 126
 127           If TW bucket has been already destroyed we fall back to VJ's scheme
 128           and use initial timestamp retrieved from peer table.
 129         */
 130        if (tcptw->tw_ts_recent_stamp &&
 131            (twp == NULL || (sysctl_tcp_tw_reuse &&
 132                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 133                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 134                if (tp->write_seq == 0)
 135                        tp->write_seq = 1;
 136                tp->rx_opt.ts_recent           = tcptw->tw_ts_recent;
 137                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 138                sock_hold(sktw);
 139                return 1;
 140        }
 141
 142        return 0;
 143}
 144
 145EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 146
 147/* This will initiate an outgoing connection. */
 148int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 149{
 150        struct inet_sock *inet = inet_sk(sk);
 151        struct tcp_sock *tp = tcp_sk(sk);
 152        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153        struct rtable *rt;
 154        __be32 daddr, nexthop;
 155        int tmp;
 156        int err;
 157
 158        if (addr_len < sizeof(struct sockaddr_in))
 159                return -EINVAL;
 160
 161        if (usin->sin_family != AF_INET)
 162                return -EAFNOSUPPORT;
 163
 164        nexthop = daddr = usin->sin_addr.s_addr;
 165        if (inet->opt && inet->opt->srr) {
 166                if (!daddr)
 167                        return -EINVAL;
 168                nexthop = inet->opt->faddr;
 169        }
 170
 171        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 173                               IPPROTO_TCP,
 174                               inet->sport, usin->sin_port, sk, 1);
 175        if (tmp < 0) {
 176                if (tmp == -ENETUNREACH)
 177                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178                return tmp;
 179        }
 180
 181        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182                ip_rt_put(rt);
 183                return -ENETUNREACH;
 184        }
 185
 186        if (!inet->opt || !inet->opt->srr)
 187                daddr = rt->rt_dst;
 188
 189        if (!inet->saddr)
 190                inet->saddr = rt->rt_src;
 191        inet->rcv_saddr = inet->saddr;
 192
 193        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 194                /* Reset inherited state */
 195                tp->rx_opt.ts_recent           = 0;
 196                tp->rx_opt.ts_recent_stamp = 0;
 197                tp->write_seq                   = 0;
 198        }
 199
 200        if (tcp_death_row.sysctl_tw_recycle &&
 201            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 202                struct inet_peer *peer = rt_get_peer(rt);
 203                /*
 204                 * VJ's idea. We save last timestamp seen from
 205                 * the destination in peer table, when entering state
 206                 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 207                 * when trying new connection.
 208                 */
 209                if (peer != NULL &&
 210                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 211                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 212                        tp->rx_opt.ts_recent = peer->tcp_ts;
 213                }
 214        }
 215
 216        inet->dport = usin->sin_port;
 217        inet->daddr = daddr;
 218
 219        inet_csk(sk)->icsk_ext_hdr_len = 0;
 220        if (inet->opt)
 221                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 222
 223        tp->rx_opt.mss_clamp = 536;
 224
 225        /* Socket identity is still unknown (sport may be zero).
 226         * However we set state to SYN-SENT and not releasing socket
 227         * lock select source port, enter ourselves into the hash tables and
 228         * complete initialization after this.
 229         */
 230        tcp_set_state(sk, TCP_SYN_SENT);
 231        err = inet_hash_connect(&tcp_death_row, sk);
 232        if (err)
 233                goto failure;
 234
 235        err = ip_route_newports(&rt, IPPROTO_TCP,
 236                                inet->sport, inet->dport, sk);
 237        if (err)
 238                goto failure;
 239
 240        /* OK, now commit destination to socket.  */
 241        sk->sk_gso_type = SKB_GSO_TCPV4;
 242        sk_setup_caps(sk, &rt->u.dst);
 243
 244        if (!tp->write_seq)
 245                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 246                                                           inet->daddr,
 247                                                           inet->sport,
 248                                                           usin->sin_port);
 249
 250        inet->id = tp->write_seq ^ jiffies;
 251
 252        err = tcp_connect(sk);
 253        rt = NULL;
 254        if (err)
 255                goto failure;
 256
 257        return 0;
 258
 259failure:
 260        /*
 261         * This unhashes the socket and releases the local port,
 262         * if necessary.
 263         */
 264        tcp_set_state(sk, TCP_CLOSE);
 265        ip_rt_put(rt);
 266        sk->sk_route_caps = 0;
 267        inet->dport = 0;
 268        return err;
 269}
 270
 271/*
 272 * This routine does path mtu discovery as defined in RFC1191.
 273 */
 274static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 275{
 276        struct dst_entry *dst;
 277        struct inet_sock *inet = inet_sk(sk);
 278
 279        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 280         * send out by Linux are always <576bytes so they should go through
 281         * unfragmented).
 282         */
 283        if (sk->sk_state == TCP_LISTEN)
 284                return;
 285
 286        /* We don't check in the destentry if pmtu discovery is forbidden
 287         * on this route. We just assume that no packet_to_big packets
 288         * are send back when pmtu discovery is not active.
 289         * There is a small race when the user changes this flag in the
 290         * route, but I think that's acceptable.
 291         */
 292        if ((dst = __sk_dst_check(sk, 0)) == NULL)
 293                return;
 294
 295        dst->ops->update_pmtu(dst, mtu);
 296
 297        /* Something is about to be wrong... Remember soft error
 298         * for the case, if this connection will not able to recover.
 299         */
 300        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 301                sk->sk_err_soft = EMSGSIZE;
 302
 303        mtu = dst_mtu(dst);
 304
 305        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 306            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 307                tcp_sync_mss(sk, mtu);
 308
 309                /* Resend the TCP packet because it's
 310                 * clear that the old packet has been
 311                 * dropped. This is the new "fast" path mtu
 312                 * discovery.
 313                 */
 314                tcp_simple_retransmit(sk);
 315        } /* else let the usual retransmit timer handle it */
 316}
 317
 318/*
 319 * This routine is called by the ICMP module when it gets some
 320 * sort of error condition.  If err < 0 then the socket should
 321 * be closed and the error returned to the user.  If err > 0
 322 * it's just the icmp type << 8 | icmp code.  After adjustment
 323 * header points to the first 8 bytes of the tcp header.  We need
 324 * to find the appropriate port.
 325 *
 326 * The locking strategy used here is very "optimistic". When
 327 * someone else accesses the socket the ICMP is just dropped
 328 * and for some paths there is no check at all.
 329 * A more general error queue to queue errors for later handling
 330 * is probably better.
 331 *
 332 */
 333
 334void tcp_v4_err(struct sk_buff *skb, u32 info)
 335{
 336        struct iphdr *iph = (struct iphdr *)skb->data;
 337        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 338        struct tcp_sock *tp;
 339        struct inet_sock *inet;
 340        const int type = icmp_hdr(skb)->type;
 341        const int code = icmp_hdr(skb)->code;
 342        struct sock *sk;
 343        __u32 seq;
 344        int err;
 345        struct net *net = dev_net(skb->dev);
 346
 347        if (skb->len < (iph->ihl << 2) + 8) {
 348                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 349                return;
 350        }
 351
 352        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 353                        iph->saddr, th->source, inet_iif(skb));
 354        if (!sk) {
 355                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 356                return;
 357        }
 358        if (sk->sk_state == TCP_TIME_WAIT) {
 359                inet_twsk_put(inet_twsk(sk));
 360                return;
 361        }
 362
 363        bh_lock_sock(sk);
 364        /* If too many ICMPs get dropped on busy
 365         * servers this needs to be solved differently.
 366         */
 367        if (sock_owned_by_user(sk))
 368                NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 369
 370        if (sk->sk_state == TCP_CLOSE)
 371                goto out;
 372
 373        tp = tcp_sk(sk);
 374        seq = ntohl(th->seq);
 375        if (sk->sk_state != TCP_LISTEN &&
 376            !between(seq, tp->snd_una, tp->snd_nxt)) {
 377                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 378                goto out;
 379        }
 380
 381        switch (type) {
 382        case ICMP_SOURCE_QUENCH:
 383                /* Just silently ignore these. */
 384                goto out;
 385        case ICMP_PARAMETERPROB:
 386                err = EPROTO;
 387                break;
 388        case ICMP_DEST_UNREACH:
 389                if (code > NR_ICMP_UNREACH)
 390                        goto out;
 391
 392                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 393                        if (!sock_owned_by_user(sk))
 394                                do_pmtu_discovery(sk, iph, info);
 395                        goto out;
 396                }
 397
 398                err = icmp_err_convert[code].errno;
 399                break;
 400        case ICMP_TIME_EXCEEDED:
 401                err = EHOSTUNREACH;
 402                break;
 403        default:
 404                goto out;
 405        }
 406
 407        switch (sk->sk_state) {
 408                struct request_sock *req, **prev;
 409        case TCP_LISTEN:
 410                if (sock_owned_by_user(sk))
 411                        goto out;
 412
 413                req = inet_csk_search_req(sk, &prev, th->dest,
 414                                          iph->daddr, iph->saddr);
 415                if (!req)
 416                        goto out;
 417
 418                /* ICMPs are not backlogged, hence we cannot get
 419                   an established socket here.
 420                 */
 421                WARN_ON(req->sk);
 422
 423                if (seq != tcp_rsk(req)->snt_isn) {
 424                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 425                        goto out;
 426                }
 427
 428                /*
 429                 * Still in SYN_RECV, just remove it silently.
 430                 * There is no good way to pass the error to the newly
 431                 * created socket, and POSIX does not want network
 432                 * errors returned from accept().
 433                 */
 434                inet_csk_reqsk_queue_drop(sk, req, prev);
 435                goto out;
 436
 437        case TCP_SYN_SENT:
 438        case TCP_SYN_RECV:  /* Cannot happen.
 439                               It can f.e. if SYNs crossed.
 440                             */
 441                if (!sock_owned_by_user(sk)) {
 442                        sk->sk_err = err;
 443
 444                        sk->sk_error_report(sk);
 445
 446                        tcp_done(sk);
 447                } else {
 448                        sk->sk_err_soft = err;
 449                }
 450                goto out;
 451        }
 452
 453        /* If we've already connected we will keep trying
 454         * until we time out, or the user gives up.
 455         *
 456         * rfc1122 4.2.3.9 allows to consider as hard errors
 457         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 458         * but it is obsoleted by pmtu discovery).
 459         *
 460         * Note, that in modern internet, where routing is unreliable
 461         * and in each dark corner broken firewalls sit, sending random
 462         * errors ordered by their masters even this two messages finally lose
 463         * their original sense (even Linux sends invalid PORT_UNREACHs)
 464         *
 465         * Now we are in compliance with RFCs.
 466         *                                                        --ANK (980905)
 467         */
 468
 469        inet = inet_sk(sk);
 470        if (!sock_owned_by_user(sk) && inet->recverr) {
 471                sk->sk_err = err;
 472                sk->sk_error_report(sk);
 473        } else        { /* Only an error on timeout */
 474                sk->sk_err_soft = err;
 475        }
 476
 477out:
 478        bh_unlock_sock(sk);
 479        sock_put(sk);
 480}
 481
 482/* This routine computes an IPv4 TCP checksum. */
 483void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 484{
 485        struct inet_sock *inet = inet_sk(sk);
 486        struct tcphdr *th = tcp_hdr(skb);
 487
 488        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 489                th->check = ~tcp_v4_check(len, inet->saddr,
 490                                          inet->daddr, 0);
 491                skb->csum_start = skb_transport_header(skb) - skb->head;
 492                skb->csum_offset = offsetof(struct tcphdr, check);
 493        } else {
 494                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 495                                         csum_partial((char *)th,
 496                                                      th->doff << 2,
 497                                                      skb->csum));
 498        }
 499}
 500
 501int tcp_v4_gso_send_check(struct sk_buff *skb)
 502{
 503        const struct iphdr *iph;
 504        struct tcphdr *th;
 505
 506        if (!pskb_may_pull(skb, sizeof(*th)))
 507                return -EINVAL;
 508
 509        iph = ip_hdr(skb);
 510        th = tcp_hdr(skb);
 511
 512        th->check = 0;
 513        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 514        skb->csum_start = skb_transport_header(skb) - skb->head;
 515        skb->csum_offset = offsetof(struct tcphdr, check);
 516        skb->ip_summed = CHECKSUM_PARTIAL;
 517        return 0;
 518}
 519
 520/*
 521 *        This routine will send an RST to the other tcp.
 522 *
 523 *        Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 524 *                      for reset.
 525 *        Answer: if a packet caused RST, it is not for a socket
 526 *                existing in our system, if it is matched to a socket,
 527 *                it is just duplicate segment or bug in other side's TCP.
 528 *                So that we build reply only basing on parameters
 529 *                arrived with segment.
 530 *        Exception: precedence violation. We do not implement it in any case.
 531 */
 532
 533static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 534{
 535        struct tcphdr *th = tcp_hdr(skb);
 536        struct {
 537                struct tcphdr th;
 538#ifdef CONFIG_TCP_MD5SIG
 539                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 540#endif
 541        } rep;
 542        struct ip_reply_arg arg;
 543#ifdef CONFIG_TCP_MD5SIG
 544        struct tcp_md5sig_key *key;
 545#endif
 546        struct net *net;
 547
 548        /* Never send a reset in response to a reset. */
 549        if (th->rst)
 550                return;
 551
 552        if (skb->rtable->rt_type != RTN_LOCAL)
 553                return;
 554
 555        /* Swap the send and the receive. */
 556        memset(&rep, 0, sizeof(rep));
 557        rep.th.dest   = th->source;
 558        rep.th.source = th->dest;
 559        rep.th.doff   = sizeof(struct tcphdr) / 4;
 560        rep.th.rst    = 1;
 561
 562        if (th->ack) {
 563                rep.th.seq = th->ack_seq;
 564        } else {
 565                rep.th.ack = 1;
 566                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 567                                       skb->len - (th->doff << 2));
 568        }
 569
 570        memset(&arg, 0, sizeof(arg));
 571        arg.iov[0].iov_base = (unsigned char *)&rep;
 572        arg.iov[0].iov_len  = sizeof(rep.th);
 573
 574#ifdef CONFIG_TCP_MD5SIG
 575        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 576        if (key) {
 577                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 578                                   (TCPOPT_NOP << 16) |
 579                                   (TCPOPT_MD5SIG << 8) |
 580                                   TCPOLEN_MD5SIG);
 581                /* Update length and the length the header thinks exists */
 582                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 583                rep.th.doff = arg.iov[0].iov_len / 4;
 584
 585                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 586                                     key, ip_hdr(skb)->saddr,
 587                                     ip_hdr(skb)->daddr, &rep.th);
 588        }
 589#endif
 590        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 591                                      ip_hdr(skb)->saddr, /* XXX */
 592                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 593        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 594        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 595
 596        net = dev_net(skb->dst->dev);
 597        ip_send_reply(net->ipv4.tcp_sock, skb,
 598                      &arg, arg.iov[0].iov_len);
 599
 600        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 601        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 602}
 603
 604/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 605   outside socket context is ugly, certainly. What can I do?
 606 */
 607
 608static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 609                            u32 win, u32 ts, int oif,
 610                            struct tcp_md5sig_key *key,
 611                            int reply_flags)
 612{
 613        struct tcphdr *th = tcp_hdr(skb);
 614        struct {
 615                struct tcphdr th;
 616                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 617#ifdef CONFIG_TCP_MD5SIG
 618                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 619#endif
 620                        ];
 621        } rep;
 622        struct ip_reply_arg arg;
 623        struct net *net = dev_net(skb->dst->dev);
 624
 625        memset(&rep.th, 0, sizeof(struct tcphdr));
 626        memset(&arg, 0, sizeof(arg));
 627
 628        arg.iov[0].iov_base = (unsigned char *)&rep;
 629        arg.iov[0].iov_len  = sizeof(rep.th);
 630        if (ts) {
 631                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 632                                   (TCPOPT_TIMESTAMP << 8) |
 633                                   TCPOLEN_TIMESTAMP);
 634                rep.opt[1] = htonl(tcp_time_stamp);
 635                rep.opt[2] = htonl(ts);
 636                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 637        }
 638
 639        /* Swap the send and the receive. */
 640        rep.th.dest    = th->source;
 641        rep.th.source  = th->dest;
 642        rep.th.doff    = arg.iov[0].iov_len / 4;
 643        rep.th.seq     = htonl(seq);
 644        rep.th.ack_seq = htonl(ack);
 645        rep.th.ack     = 1;
 646        rep.th.window  = htons(win);
 647
 648#ifdef CONFIG_TCP_MD5SIG
 649        if (key) {
 650                int offset = (ts) ? 3 : 0;
 651
 652                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 653                                          (TCPOPT_NOP << 16) |
 654                                          (TCPOPT_MD5SIG << 8) |
 655                                          TCPOLEN_MD5SIG);
 656                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 657                rep.th.doff = arg.iov[0].iov_len/4;
 658
 659                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 660                                    key, ip_hdr(skb)->saddr,
 661                                    ip_hdr(skb)->daddr, &rep.th);
 662        }
 663#endif
 664        arg.flags = reply_flags;
 665        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 666                                      ip_hdr(skb)->saddr, /* XXX */
 667                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 668        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 669        if (oif)
 670                arg.bound_dev_if = oif;
 671
 672        ip_send_reply(net->ipv4.tcp_sock, skb,
 673                      &arg, arg.iov[0].iov_len);
 674
 675        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 676}
 677
 678static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 679{
 680        struct inet_timewait_sock *tw = inet_twsk(sk);
 681        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 682
 683        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 684                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 685                        tcptw->tw_ts_recent,
 686                        tw->tw_bound_dev_if,
 687                        tcp_twsk_md5_key(tcptw),
 688                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 689                        );
 690
 691        inet_twsk_put(tw);
 692}
 693
 694static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 695                                  struct request_sock *req)
 696{
 697        tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 698                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 699                        req->ts_recent,
 700                        0,
 701                        tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 702                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 703}
 704
 705/*
 706 *        Send a SYN-ACK after having received a SYN.
 707 *        This still operates on a request_sock only, not on a big
 708 *        socket.
 709 */
 710static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 711                                struct dst_entry *dst)
 712{
 713        const struct inet_request_sock *ireq = inet_rsk(req);
 714        int err = -1;
 715        struct sk_buff * skb;
 716
 717        /* First, grab a route. */
 718        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 719                return -1;
 720
 721        skb = tcp_make_synack(sk, dst, req);
 722
 723        if (skb) {
 724                struct tcphdr *th = tcp_hdr(skb);
 725
 726                th->check = tcp_v4_check(skb->len,
 727                                         ireq->loc_addr,
 728                                         ireq->rmt_addr,
 729                                         csum_partial((char *)th, skb->len,
 730                                                      skb->csum));
 731
 732                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 733                                            ireq->rmt_addr,
 734                                            ireq->opt);
 735                err = net_xmit_eval(err);
 736        }
 737
 738        dst_release(dst);
 739        return err;
 740}
 741
 742static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 743{
 744        return __tcp_v4_send_synack(sk, req, NULL);
 745}
 746
 747/*
 748 *        IPv4 request_sock destructor.
 749 */
 750static void tcp_v4_reqsk_destructor(struct request_sock *req)
 751{
 752        kfree(inet_rsk(req)->opt);
 753}
 754
 755#ifdef CONFIG_SYN_COOKIES
 756static void syn_flood_warning(struct sk_buff *skb)
 757{
 758        static unsigned long warntime;
 759
 760        if (time_after(jiffies, (warntime + HZ * 60))) {
 761                warntime = jiffies;
 762                printk(KERN_INFO
 763                       "possible SYN flooding on port %d. Sending cookies.\n",
 764                       ntohs(tcp_hdr(skb)->dest));
 765        }
 766}
 767#endif
 768
 769/*
 770 * Save and compile IPv4 options into the request_sock if needed.
 771 */
 772static struct ip_options *tcp_v4_save_options(struct sock *sk,
 773                                              struct sk_buff *skb)
 774{
 775        struct ip_options *opt = &(IPCB(skb)->opt);
 776        struct ip_options *dopt = NULL;
 777
 778        if (opt && opt->optlen) {
 779                int opt_size = optlength(opt);
 780                dopt = kmalloc(opt_size, GFP_ATOMIC);
 781                if (dopt) {
 782                        if (ip_options_echo(dopt, skb)) {
 783                                kfree(dopt);
 784                                dopt = NULL;
 785                        }
 786                }
 787        }
 788        return dopt;
 789}
 790
 791#ifdef CONFIG_TCP_MD5SIG
 792/*
 793 * RFC2385 MD5 checksumming requires a mapping of
 794 * IP address->MD5 Key.
 795 * We need to maintain these in the sk structure.
 796 */
 797
 798/* Find the Key structure for an address.  */
 799static struct tcp_md5sig_key *
 800                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 801{
 802        struct tcp_sock *tp = tcp_sk(sk);
 803        int i;
 804
 805        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 806                return NULL;
 807        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 808                if (tp->md5sig_info->keys4[i].addr == addr)
 809                        return &tp->md5sig_info->keys4[i].base;
 810        }
 811        return NULL;
 812}
 813
 814struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 815                                         struct sock *addr_sk)
 816{
 817        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 818}
 819
 820EXPORT_SYMBOL(tcp_v4_md5_lookup);
 821
 822static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 823                                                      struct request_sock *req)
 824{
 825        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 826}
 827
 828/* This can be called on a newly created socket, from other files */
 829int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 830                      u8 *newkey, u8 newkeylen)
 831{
 832        /* Add Key to the list */
 833        struct tcp_md5sig_key *key;
 834        struct tcp_sock *tp = tcp_sk(sk);
 835        struct tcp4_md5sig_key *keys;
 836
 837        key = tcp_v4_md5_do_lookup(sk, addr);
 838        if (key) {
 839                /* Pre-existing entry - just update that one. */
 840                kfree(key->key);
 841                key->key = newkey;
 842                key->keylen = newkeylen;
 843        } else {
 844                struct tcp_md5sig_info *md5sig;
 845
 846                if (!tp->md5sig_info) {
 847                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 848                                                  GFP_ATOMIC);
 849                        if (!tp->md5sig_info) {
 850                                kfree(newkey);
 851                                return -ENOMEM;
 852                        }
 853                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 854                }
 855                if (tcp_alloc_md5sig_pool() == NULL) {
 856                        kfree(newkey);
 857                        return -ENOMEM;
 858                }
 859                md5sig = tp->md5sig_info;
 860
 861                if (md5sig->alloced4 == md5sig->entries4) {
 862                        keys = kmalloc((sizeof(*keys) *
 863                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
 864                        if (!keys) {
 865                                kfree(newkey);
 866                                tcp_free_md5sig_pool();
 867                                return -ENOMEM;
 868                        }
 869
 870                        if (md5sig->entries4)
 871                                memcpy(keys, md5sig->keys4,
 872                                       sizeof(*keys) * md5sig->entries4);
 873
 874                        /* Free old key list, and reference new one */
 875                        kfree(md5sig->keys4);
 876                        md5sig->keys4 = keys;
 877                        md5sig->alloced4++;
 878                }
 879                md5sig->entries4++;
 880                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 881                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 882                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 883        }
 884        return 0;
 885}
 886
 887EXPORT_SYMBOL(tcp_v4_md5_do_add);
 888
 889static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 890                               u8 *newkey, u8 newkeylen)
 891{
 892        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 893                                 newkey, newkeylen);
 894}
 895
 896int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 897{
 898        struct tcp_sock *tp = tcp_sk(sk);
 899        int i;
 900
 901        for (i = 0; i < tp->md5sig_info->entries4; i++) {
 902                if (tp->md5sig_info->keys4[i].addr == addr) {
 903                        /* Free the key */
 904                        kfree(tp->md5sig_info->keys4[i].base.key);
 905                        tp->md5sig_info->entries4--;
 906
 907                        if (tp->md5sig_info->entries4 == 0) {
 908                                kfree(tp->md5sig_info->keys4);
 909                                tp->md5sig_info->keys4 = NULL;
 910                                tp->md5sig_info->alloced4 = 0;
 911                        } else if (tp->md5sig_info->entries4 != i) {
 912                                /* Need to do some manipulation */
 913                                memmove(&tp->md5sig_info->keys4[i],
 914                                        &tp->md5sig_info->keys4[i+1],
 915                                        (tp->md5sig_info->entries4 - i) *
 916                                         sizeof(struct tcp4_md5sig_key));
 917                        }
 918                        tcp_free_md5sig_pool();
 919                        return 0;
 920                }
 921        }
 922        return -ENOENT;
 923}
 924
 925EXPORT_SYMBOL(tcp_v4_md5_do_del);
 926
 927static void tcp_v4_clear_md5_list(struct sock *sk)
 928{
 929        struct tcp_sock *tp = tcp_sk(sk);
 930
 931        /* Free each key, then the set of key keys,
 932         * the crypto element, and then decrement our
 933         * hold on the last resort crypto.
 934         */
 935        if (tp->md5sig_info->entries4) {
 936                int i;
 937                for (i = 0; i < tp->md5sig_info->entries4; i++)
 938                        kfree(tp->md5sig_info->keys4[i].base.key);
 939                tp->md5sig_info->entries4 = 0;
 940                tcp_free_md5sig_pool();
 941        }
 942        if (tp->md5sig_info->keys4) {
 943                kfree(tp->md5sig_info->keys4);
 944                tp->md5sig_info->keys4 = NULL;
 945                tp->md5sig_info->alloced4  = 0;
 946        }
 947}
 948
 949static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 950                                 int optlen)
 951{
 952        struct tcp_md5sig cmd;
 953        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 954        u8 *newkey;
 955
 956        if (optlen < sizeof(cmd))
 957                return -EINVAL;
 958
 959        if (copy_from_user(&cmd, optval, sizeof(cmd)))
 960                return -EFAULT;
 961
 962        if (sin->sin_family != AF_INET)
 963                return -EINVAL;
 964
 965        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 966                if (!tcp_sk(sk)->md5sig_info)
 967                        return -ENOENT;
 968                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
 969        }
 970
 971        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 972                return -EINVAL;
 973
 974        if (!tcp_sk(sk)->md5sig_info) {
 975                struct tcp_sock *tp = tcp_sk(sk);
 976                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
 977
 978                if (!p)
 979                        return -EINVAL;
 980
 981                tp->md5sig_info = p;
 982                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 983        }
 984
 985        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 986        if (!newkey)
 987                return -ENOMEM;
 988        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
 989                                 newkey, cmd.tcpm_keylen);
 990}
 991
 992static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
 993                                        __be32 daddr, __be32 saddr, int nbytes)
 994{
 995        struct tcp4_pseudohdr *bp;
 996        struct scatterlist sg;
 997
 998        bp = &hp->md5_blk.ip4;
 999
1000        /*
1001         * 1. the TCP pseudo-header (in the order: source IP address,
1002         * destination IP address, zero-padded protocol number, and
1003         * segment length)
1004         */
1005        bp->saddr = saddr;
1006        bp->daddr = daddr;
1007        bp->pad = 0;
1008        bp->protocol = IPPROTO_TCP;
1009        bp->len = cpu_to_be16(nbytes);
1010
1011        sg_init_one(&sg, bp, sizeof(*bp));
1012        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1013}
1014
1015static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1016                               __be32 daddr, __be32 saddr, struct tcphdr *th)
1017{
1018        struct tcp_md5sig_pool *hp;
1019        struct hash_desc *desc;
1020
1021        hp = tcp_get_md5sig_pool();
1022        if (!hp)
1023                goto clear_hash_noput;
1024        desc = &hp->md5_desc;
1025
1026        if (crypto_hash_init(desc))
1027                goto clear_hash;
1028        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1029                goto clear_hash;
1030        if (tcp_md5_hash_header(hp, th))
1031                goto clear_hash;
1032        if (tcp_md5_hash_key(hp, key))
1033                goto clear_hash;
1034        if (crypto_hash_final(desc, md5_hash))
1035                goto clear_hash;
1036
1037        tcp_put_md5sig_pool();
1038        return 0;
1039
1040clear_hash:
1041        tcp_put_md5sig_pool();
1042clear_hash_noput:
1043        memset(md5_hash, 0, 16);
1044        return 1;
1045}
1046
1047int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1048                        struct sock *sk, struct request_sock *req,
1049                        struct sk_buff *skb)
1050{
1051        struct tcp_md5sig_pool *hp;
1052        struct hash_desc *desc;
1053        struct tcphdr *th = tcp_hdr(skb);
1054        __be32 saddr, daddr;
1055
1056        if (sk) {
1057                saddr = inet_sk(sk)->saddr;
1058                daddr = inet_sk(sk)->daddr;
1059        } else if (req) {
1060                saddr = inet_rsk(req)->loc_addr;
1061                daddr = inet_rsk(req)->rmt_addr;
1062        } else {
1063                const struct iphdr *iph = ip_hdr(skb);
1064                saddr = iph->saddr;
1065                daddr = iph->daddr;
1066        }
1067
1068        hp = tcp_get_md5sig_pool();
1069        if (!hp)
1070                goto clear_hash_noput;
1071        desc = &hp->md5_desc;
1072
1073        if (crypto_hash_init(desc))
1074                goto clear_hash;
1075
1076        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1077                goto clear_hash;
1078        if (tcp_md5_hash_header(hp, th))
1079                goto clear_hash;
1080        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1081                goto clear_hash;
1082        if (tcp_md5_hash_key(hp, key))
1083                goto clear_hash;
1084        if (crypto_hash_final(desc, md5_hash))
1085                goto clear_hash;
1086
1087        tcp_put_md5sig_pool();
1088        return 0;
1089
1090clear_hash:
1091        tcp_put_md5sig_pool();
1092clear_hash_noput:
1093        memset(md5_hash, 0, 16);
1094        return 1;
1095}
1096
1097EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1098
1099static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1100{
1101        /*
1102         * This gets called for each TCP segment that arrives
1103         * so we want to be efficient.
1104         * We have 3 drop cases:
1105         * o No MD5 hash and one expected.
1106         * o MD5 hash and we're not expecting one.
1107         * o MD5 hash and its wrong.
1108         */
1109        __u8 *hash_location = NULL;
1110        struct tcp_md5sig_key *hash_expected;
1111        const struct iphdr *iph = ip_hdr(skb);
1112        struct tcphdr *th = tcp_hdr(skb);
1113        int genhash;
1114        unsigned char newhash[16];
1115
1116        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1117        hash_location = tcp_parse_md5sig_option(th);
1118
1119        /* We've parsed the options - do we have a hash? */
1120        if (!hash_expected && !hash_location)
1121                return 0;
1122
1123        if (hash_expected && !hash_location) {
1124                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1125                return 1;
1126        }
1127
1128        if (!hash_expected && hash_location) {
1129                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1130                return 1;
1131        }
1132
1133        /* Okay, so this is hash_expected and hash_location -
1134         * so we need to calculate the checksum.
1135         */
1136        genhash = tcp_v4_md5_hash_skb(newhash,
1137                                      hash_expected,
1138                                      NULL, NULL, skb);
1139
1140        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1141                if (net_ratelimit()) {
1142                        printk(KERN_INFO "MD5 Hash failed for "
1143                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1144                               NIPQUAD(iph->saddr), ntohs(th->source),
1145                               NIPQUAD(iph->daddr), ntohs(th->dest),
1146                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
1147                }
1148                return 1;
1149        }
1150        return 0;
1151}
1152
1153#endif
1154
1155struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1156        .family                =        PF_INET,
1157        .obj_size        =        sizeof(struct tcp_request_sock),
1158        .rtx_syn_ack        =        tcp_v4_send_synack,
1159        .send_ack        =        tcp_v4_reqsk_send_ack,
1160        .destructor        =        tcp_v4_reqsk_destructor,
1161        .send_reset        =        tcp_v4_send_reset,
1162};
1163
1164#ifdef CONFIG_TCP_MD5SIG
1165static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1166        .md5_lookup        =        tcp_v4_reqsk_md5_lookup,
1167};
1168#endif
1169
1170static struct timewait_sock_ops tcp_timewait_sock_ops = {
1171        .twsk_obj_size        = sizeof(struct tcp_timewait_sock),
1172        .twsk_unique        = tcp_twsk_unique,
1173        .twsk_destructor= tcp_twsk_destructor,
1174};
1175
1176int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1177{
1178        struct inet_request_sock *ireq;
1179        struct tcp_options_received tmp_opt;
1180        struct request_sock *req;
1181        __be32 saddr = ip_hdr(skb)->saddr;
1182        __be32 daddr = ip_hdr(skb)->daddr;
1183        __u32 isn = TCP_SKB_CB(skb)->when;
1184        struct dst_entry *dst = NULL;
1185#ifdef CONFIG_SYN_COOKIES
1186        int want_cookie = 0;
1187#else
1188#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1189#endif
1190
1191        /* Never answer to SYNs send to broadcast or multicast */
1192        if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1193                goto drop;
1194
1195        /* TW buckets are converted to open requests without
1196         * limitations, they conserve resources and peer is
1197         * evidently real one.
1198         */
1199        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1200#ifdef CONFIG_SYN_COOKIES
1201                if (sysctl_tcp_syncookies) {
1202                        want_cookie = 1;
1203                } else
1204#endif
1205                goto drop;
1206        }
1207
1208        /* Accept backlog is full. If we have already queued enough
1209         * of warm entries in syn queue, drop request. It is better than
1210         * clogging syn queue with openreqs with exponentially increasing
1211         * timeout.
1212         */
1213        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1214                goto drop;
1215
1216        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1217        if (!req)
1218                goto drop;
1219
1220#ifdef CONFIG_TCP_MD5SIG
1221        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1222#endif
1223
1224        tcp_clear_options(&tmp_opt);
1225        tmp_opt.mss_clamp = 536;
1226        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1227
1228        tcp_parse_options(skb, &tmp_opt, 0);
1229
1230        if (want_cookie && !tmp_opt.saw_tstamp)
1231                tcp_clear_options(&tmp_opt);
1232
1233        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1234                /* Some OSes (unknown ones, but I see them on web server, which
1235                 * contains information interesting only for windows'
1236                 * users) do not send their stamp in SYN. It is easy case.
1237                 * We simply do not advertise TS support.
1238                 */
1239                tmp_opt.saw_tstamp = 0;
1240                tmp_opt.tstamp_ok  = 0;
1241        }
1242        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1243
1244        tcp_openreq_init(req, &tmp_opt, skb);
1245
1246        if (security_inet_conn_request(sk, skb, req))
1247                goto drop_and_free;
1248
1249        ireq = inet_rsk(req);
1250        ireq->loc_addr = daddr;
1251        ireq->rmt_addr = saddr;
1252        ireq->no_srccheck = inet_sk(sk)->transparent;
1253        ireq->opt = tcp_v4_save_options(sk, skb);
1254        if (!want_cookie)
1255                TCP_ECN_create_request(req, tcp_hdr(skb));
1256
1257        if (want_cookie) {
1258#ifdef CONFIG_SYN_COOKIES
1259                syn_flood_warning(skb);
1260                req->cookie_ts = tmp_opt.tstamp_ok;
1261#endif
1262                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1263        } else if (!isn) {
1264                struct inet_peer *peer = NULL;
1265
1266                /* VJ's idea. We save last timestamp seen
1267                 * from the destination in peer table, when entering
1268                 * state TIME-WAIT, and check against it before
1269                 * accepting new connection request.
1270                 *
1271                 * If "isn" is not zero, this request hit alive
1272                 * timewait bucket, so that all the necessary checks
1273                 * are made in the function processing timewait state.
1274                 */
1275                if (tmp_opt.saw_tstamp &&
1276                    tcp_death_row.sysctl_tw_recycle &&
1277                    (dst = inet_csk_route_req(sk, req)) != NULL &&
1278                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1279                    peer->v4daddr == saddr) {
1280                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1281                            (s32)(peer->tcp_ts - req->ts_recent) >
1282                                                        TCP_PAWS_WINDOW) {
1283                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1284                                goto drop_and_release;
1285                        }
1286                }
1287                /* Kill the following clause, if you dislike this way. */
1288                else if (!sysctl_tcp_syncookies &&
1289                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1290                          (sysctl_max_syn_backlog >> 2)) &&
1291                         (!peer || !peer->tcp_ts_stamp) &&
1292                         (!dst || !dst_metric(dst, RTAX_RTT))) {
1293                        /* Without syncookies last quarter of
1294                         * backlog is filled with destinations,
1295                         * proven to be alive.
1296                         * It means that we continue to communicate
1297                         * to destinations, already remembered
1298                         * to the moment of synflood.
1299                         */
1300                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1301                                       "request from " NIPQUAD_FMT "/%u\n",
1302                                       NIPQUAD(saddr),
1303                                       ntohs(tcp_hdr(skb)->source));
1304                        goto drop_and_release;
1305                }
1306
1307                isn = tcp_v4_init_sequence(skb);
1308        }
1309        tcp_rsk(req)->snt_isn = isn;
1310
1311        if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1312                goto drop_and_free;
1313
1314        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1315        return 0;
1316
1317drop_and_release:
1318        dst_release(dst);
1319drop_and_free:
1320        reqsk_free(req);
1321drop:
1322        return 0;
1323}
1324
1325
1326/*
1327 * The three way handshake has completed - we got a valid synack -
1328 * now create the new socket.
1329 */
1330struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1331                                  struct request_sock *req,
1332                                  struct dst_entry *dst)
1333{
1334        struct inet_request_sock *ireq;
1335        struct inet_sock *newinet;
1336        struct tcp_sock *newtp;
1337        struct sock *newsk;
1338#ifdef CONFIG_TCP_MD5SIG
1339        struct tcp_md5sig_key *key;
1340#endif
1341
1342        if (sk_acceptq_is_full(sk))
1343                goto exit_overflow;
1344
1345        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1346                goto exit;
1347
1348        newsk = tcp_create_openreq_child(sk, req, skb);
1349        if (!newsk)
1350                goto exit;
1351
1352        newsk->sk_gso_type = SKB_GSO_TCPV4;
1353        sk_setup_caps(newsk, dst);
1354
1355        newtp                      = tcp_sk(newsk);
1356        newinet                      = inet_sk(newsk);
1357        ireq                      = inet_rsk(req);
1358        newinet->daddr              = ireq->rmt_addr;
1359        newinet->rcv_saddr    = ireq->loc_addr;
1360        newinet->saddr              = ireq->loc_addr;
1361        newinet->opt              = ireq->opt;
1362        ireq->opt              = NULL;
1363        newinet->mc_index     = inet_iif(skb);
1364        newinet->mc_ttl              = ip_hdr(skb)->ttl;
1365        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1366        if (newinet->opt)
1367                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1368        newinet->id = newtp->write_seq ^ jiffies;
1369
1370        tcp_mtup_init(newsk);
1371        tcp_sync_mss(newsk, dst_mtu(dst));
1372        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1373        if (tcp_sk(sk)->rx_opt.user_mss &&
1374            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1375                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1376
1377        tcp_initialize_rcv_mss(newsk);
1378
1379#ifdef CONFIG_TCP_MD5SIG
1380        /* Copy over the MD5 key from the original socket */
1381        if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1382                /*
1383                 * We're using one, so create a matching key
1384                 * on the newsk structure. If we fail to get
1385                 * memory, then we end up not copying the key
1386                 * across. Shucks.
1387                 */
1388                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1389                if (newkey != NULL)
1390                        tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1391                                          newkey, key->keylen);
1392                newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1393        }
1394#endif
1395
1396        __inet_hash_nolisten(newsk);
1397        __inet_inherit_port(sk, newsk);
1398
1399        return newsk;
1400
1401exit_overflow:
1402        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1403exit:
1404        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1405        dst_release(dst);
1406        return NULL;
1407}
1408
1409static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1410{
1411        struct tcphdr *th = tcp_hdr(skb);
1412        const struct iphdr *iph = ip_hdr(skb);
1413        struct sock *nsk;
1414        struct request_sock **prev;
1415        /* Find possible connection requests. */
1416        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1417                                                       iph->saddr, iph->daddr);
1418        if (req)
1419                return tcp_check_req(sk, skb, req, prev);
1420
1421        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1422                        th->source, iph->daddr, th->dest, inet_iif(skb));
1423
1424        if (nsk) {
1425                if (nsk->sk_state != TCP_TIME_WAIT) {
1426                        bh_lock_sock(nsk);
1427                        return nsk;
1428                }
1429                inet_twsk_put(inet_twsk(nsk));
1430                return NULL;
1431        }
1432
1433#ifdef CONFIG_SYN_COOKIES
1434        if (!th->rst && !th->syn && th->ack)
1435                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1436#endif
1437        return sk;
1438}
1439
1440static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1441{
1442        const struct iphdr *iph = ip_hdr(skb);
1443
1444        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1445                if (!tcp_v4_check(skb->len, iph->saddr,
1446                                  iph->daddr, skb->csum)) {
1447                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1448                        return 0;
1449                }
1450        }
1451
1452        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1453                                       skb->len, IPPROTO_TCP, 0);
1454
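            /*
             * skb->csum now holds the pseudo-header sum.  Short segments are
             * presumably cheap enough to verify right away; for longer ones
             * the final check is deferred until the data is actually consumed.
             */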
1455        if (skb->len <= 76) {
1456                return __skb_checksum_complete(skb);
1457        }
1458        return 0;
1459}
1460
1461
1462/* The socket must have its spinlock held when we get
1463 * here.
1464 *
1465 * We have a potential double-lock case here, so even when
1466 * doing backlog processing we use the BH locking scheme.
1467 * This is because we cannot sleep with the original spinlock
1468 * held.
1469 */
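    /*
     * When the socket is not owned by the user, tcp_v4_rcv() may call this
     * directly under bh_lock_sock_nested(); otherwise the skb is queued with
     * sk_add_backlog() and this function runs later from release_sock() as
     * the ->backlog_rcv handler (see tcp_prot below).
     */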
1470int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1471{
1472        struct sock *rsk;
1473#ifdef CONFIG_TCP_MD5SIG
1474        /*
1475         * We really want to reject the packet as early as possible
1476         * if:
1477         *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1478         *  o There is an MD5 option and we're not expecting one
1479         */
1480        if (tcp_v4_inbound_md5_hash(sk, skb))
1481                goto discard;
1482#endif
1483
1484        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1485                TCP_CHECK_TIMER(sk);
1486                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1487                        rsk = sk;
1488                        goto reset;
1489                }
1490                TCP_CHECK_TIMER(sk);
1491                return 0;
1492        }
1493
1494        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1495                goto csum_err;
1496
1497        if (sk->sk_state == TCP_LISTEN) {
1498                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1499                if (!nsk)
1500                        goto discard;
1501
1502                if (nsk != sk) {
1503                        if (tcp_child_process(sk, nsk, skb)) {
1504                                rsk = nsk;
1505                                goto reset;
1506                        }
1507                        return 0;
1508                }
1509        }
1510
1511        TCP_CHECK_TIMER(sk);
1512        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1513                rsk = sk;
1514                goto reset;
1515        }
1516        TCP_CHECK_TIMER(sk);
1517        return 0;
1518
1519reset:
1520        tcp_v4_send_reset(rsk, skb);
1521discard:
1522        kfree_skb(skb);
1523        /* Be careful here. If this function gets more complicated and
1524         * gcc suffers from register pressure on the x86, sk (in %ebx)
1525         * might be destroyed here. This current version compiles correctly,
1526         * but you have been warned.
1527         */
1528        return 0;
1529
1530csum_err:
1531        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1532        goto discard;
1533}
1534
1535/*
1536 *        From tcp_input.c
1537 */
1538
1539int tcp_v4_rcv(struct sk_buff *skb)
1540{
1541        const struct iphdr *iph;
1542        struct tcphdr *th;
1543        struct sock *sk;
1544        int ret;
1545        struct net *net = dev_net(skb->dev);
1546
1547        if (skb->pkt_type != PACKET_HOST)
1548                goto discard_it;
1549
1550        /* Count it even if it's bad */
1551        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552
1553        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554                goto discard_it;
1555
1556        th = tcp_hdr(skb);
1557
1558        if (th->doff < sizeof(struct tcphdr) / 4)
1559                goto bad_packet;
1560        if (!pskb_may_pull(skb, th->doff * 4))
1561                goto discard_it;
1562
1563        /* An explanation is required here, I think.
1564         * Packet length and doff are validated by header prediction,
1565         * provided the case of th->doff == 0 is eliminated.
1566         * So we defer the checks. */
1567        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1568                goto bad_packet;
1569
1570        th = tcp_hdr(skb);
1571        iph = ip_hdr(skb);
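            /* SYN and FIN each occupy one unit of sequence space, hence the
             * th->syn + th->fin terms in end_seq below.
             */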
1572        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1573        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1574                                    skb->len - th->doff * 4);
1575        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1576        TCP_SKB_CB(skb)->when         = 0;
1577        TCP_SKB_CB(skb)->flags         = iph->tos;
1578        TCP_SKB_CB(skb)->sacked         = 0;
1579
1580        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1581        if (!sk)
1582                goto no_tcp_socket;
1583
1584process:
1585        if (sk->sk_state == TCP_TIME_WAIT)
1586                goto do_time_wait;
1587
1588        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1589                goto discard_and_relse;
1590        nf_reset(skb);
1591
1592        if (sk_filter(sk, skb))
1593                goto discard_and_relse;
1594
1595        skb->dev = NULL;
1596
1597        bh_lock_sock_nested(sk);
1598        ret = 0;
1599        if (!sock_owned_by_user(sk)) {
1600#ifdef CONFIG_NET_DMA
1601                struct tcp_sock *tp = tcp_sk(sk);
1602                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1603                        tp->ucopy.dma_chan = get_softnet_dma();
1604                if (tp->ucopy.dma_chan)
1605                        ret = tcp_v4_do_rcv(sk, skb);
1606                else
1607#endif
1608                {
1609                        if (!tcp_prequeue(sk, skb))
1610                                ret = tcp_v4_do_rcv(sk, skb);
1611                }
1612        } else
1613                sk_add_backlog(sk, skb);
1614        bh_unlock_sock(sk);
1615
1616        sock_put(sk);
1617
1618        return ret;
1619
1620no_tcp_socket:
1621        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1622                goto discard_it;
1623
1624        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1625bad_packet:
1626                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1627        } else {
1628                tcp_v4_send_reset(NULL, skb);
1629        }
1630
1631discard_it:
1632        /* Discard frame. */
1633        kfree_skb(skb);
1634        return 0;
1635
1636discard_and_relse:
1637        sock_put(sk);
1638        goto discard_it;
1639
1640do_time_wait:
1641        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1642                inet_twsk_put(inet_twsk(sk));
1643                goto discard_it;
1644        }
1645
1646        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1647                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1648                inet_twsk_put(inet_twsk(sk));
1649                goto discard_it;
1650        }
1651        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1652        case TCP_TW_SYN: {
1653                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1654                                                        &tcp_hashinfo,
1655                                                        iph->daddr, th->dest,
1656                                                        inet_iif(skb));
1657                if (sk2) {
1658                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1659                        inet_twsk_put(inet_twsk(sk));
1660                        sk = sk2;
1661                        goto process;
1662                }
1663                /* Fall through to ACK */
1664        }
1665        case TCP_TW_ACK:
1666                tcp_v4_timewait_ack(sk, skb);
1667                break;
1668        case TCP_TW_RST:
1669                goto no_tcp_socket;
1670        case TCP_TW_SUCCESS:;
1671        }
1672        goto discard_it;
1673}
1674
1675/* VJ's idea. Save the last timestamp seen from this destination
1676 * and hold it for at least the normal timewait interval, to use for
1677 * duplicate segment detection in subsequent connections before they
1678 * enter the synchronized state.
1679 */
1680
1681int tcp_v4_remember_stamp(struct sock *sk)
1682{
1683        struct inet_sock *inet = inet_sk(sk);
1684        struct tcp_sock *tp = tcp_sk(sk);
1685        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1686        struct inet_peer *peer = NULL;
1687        int release_it = 0;
1688
1689        if (!rt || rt->rt_dst != inet->daddr) {
1690                peer = inet_getpeer(inet->daddr, 1);
1691                release_it = 1;
1692        } else {
1693                if (!rt->peer)
1694                        rt_bind_peer(rt, 1);
1695                peer = rt->peer;
1696        }
1697
1698        if (peer) {
1699                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1700                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1701                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1702                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1703                        peer->tcp_ts = tp->rx_opt.ts_recent;
1704                }
1705                if (release_it)
1706                        inet_putpeer(peer);
1707                return 1;
1708        }
1709
1710        return 0;
1711}
1712
1713int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1714{
1715        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1716
1717        if (peer) {
1718                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1719
1720                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1721                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1722                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1723                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1724                        peer->tcp_ts           = tcptw->tw_ts_recent;
1725                }
1726                inet_putpeer(peer);
1727                return 1;
1728        }
1729
1730        return 0;
1731}
1732
1733struct inet_connection_sock_af_ops ipv4_specific = {
1734        .queue_xmit           = ip_queue_xmit,
1735        .send_check           = tcp_v4_send_check,
1736        .rebuild_header           = inet_sk_rebuild_header,
1737        .conn_request           = tcp_v4_conn_request,
1738        .syn_recv_sock           = tcp_v4_syn_recv_sock,
1739        .remember_stamp           = tcp_v4_remember_stamp,
1740        .net_header_len           = sizeof(struct iphdr),
1741        .setsockopt           = ip_setsockopt,
1742        .getsockopt           = ip_getsockopt,
1743        .addr2sockaddr           = inet_csk_addr2sockaddr,
1744        .sockaddr_len           = sizeof(struct sockaddr_in),
1745        .bind_conflict           = inet_csk_bind_conflict,
1746#ifdef CONFIG_COMPAT
1747        .compat_setsockopt = compat_ip_setsockopt,
1748        .compat_getsockopt = compat_ip_getsockopt,
1749#endif
1750};
1751
1752#ifdef CONFIG_TCP_MD5SIG
1753static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1754        .md5_lookup                = tcp_v4_md5_lookup,
1755        .calc_md5_hash                = tcp_v4_md5_hash_skb,
1756        .md5_add                = tcp_v4_md5_add_func,
1757        .md5_parse                = tcp_v4_parse_md5_keys,
1758};
1759#endif
1760
1761/* NOTE: A lot of things are set to zero explicitly by the call to
1762 *       sk_alloc(), so they need not be done here.
1763 */
1764static int tcp_v4_init_sock(struct sock *sk)
1765{
1766        struct inet_connection_sock *icsk = inet_csk(sk);
1767        struct tcp_sock *tp = tcp_sk(sk);
1768
1769        skb_queue_head_init(&tp->out_of_order_queue);
1770        tcp_init_xmit_timers(sk);
1771        tcp_prequeue_init(tp);
1772
1773        icsk->icsk_rto = TCP_TIMEOUT_INIT;
1774        tp->mdev = TCP_TIMEOUT_INIT;
1775
1776        /* So many TCP implementations out there (incorrectly) count the
1777         * initial SYN frame in their delayed-ACK and congestion control
1778         * algorithms that we must have the following bandaid to talk
1779         * efficiently to them.  -DaveM
1780         */
1781        tp->snd_cwnd = 2;
1782
1783        /* See draft-stevens-tcpca-spec-01 for discussion of the
1784         * initialization of these values.
1785         */
1786        tp->snd_ssthresh = 0x7fffffff;        /* Infinity */
1787        tp->snd_cwnd_clamp = ~0;
1788        tp->mss_cache = 536;
1789
1790        tp->reordering = sysctl_tcp_reordering;
1791        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1792
1793        sk->sk_state = TCP_CLOSE;
1794
1795        sk->sk_write_space = sk_stream_write_space;
1796        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1797
1798        icsk->icsk_af_ops = &ipv4_specific;
1799        icsk->icsk_sync_mss = tcp_sync_mss;
1800#ifdef CONFIG_TCP_MD5SIG
1801        tp->af_specific = &tcp_sock_ipv4_specific;
1802#endif
1803
1804        sk->sk_sndbuf = sysctl_tcp_wmem[1];
1805        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1806
1807        atomic_inc(&tcp_sockets_allocated);
1808
1809        return 0;
1810}
1811
1812void tcp_v4_destroy_sock(struct sock *sk)
1813{
1814        struct tcp_sock *tp = tcp_sk(sk);
1815
1816        tcp_clear_xmit_timers(sk);
1817
1818        tcp_cleanup_congestion_control(sk);
1819
1820        /* Clean up the write buffer. */
1821        tcp_write_queue_purge(sk);
1822
1823        /* Cleans up our, hopefully empty, out_of_order_queue. */
1824        __skb_queue_purge(&tp->out_of_order_queue);
1825
1826#ifdef CONFIG_TCP_MD5SIG
1827        /* Clean up the MD5 key list, if any */
1828        if (tp->md5sig_info) {
1829                tcp_v4_clear_md5_list(sk);
1830                kfree(tp->md5sig_info);
1831                tp->md5sig_info = NULL;
1832        }
1833#endif
1834
1835#ifdef CONFIG_NET_DMA
1836        /* Cleans up our sk_async_wait_queue */
1837        __skb_queue_purge(&sk->sk_async_wait_queue);
1838#endif
1839
1840        /* Clean up the prequeue; it really should be empty by now. */
1841        __skb_queue_purge(&tp->ucopy.prequeue);
1842
1843        /* Clean up a referenced TCP bind bucket. */
1844        if (inet_csk(sk)->icsk_bind_hash)
1845                inet_put_port(sk);
1846
1847        /*
1848         * If sendmsg cached page exists, toss it.
1849         */
1850        if (sk->sk_sndmsg_page) {
1851                __free_page(sk->sk_sndmsg_page);
1852                sk->sk_sndmsg_page = NULL;
1853        }
1854
1855        atomic_dec(&tcp_sockets_allocated);
1856}
1857
1858EXPORT_SYMBOL(tcp_v4_destroy_sock);
1859
1860#ifdef CONFIG_PROC_FS
1861/* Proc filesystem TCP sock list dumping. */
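    /*
     * The iterator below walks four states: LISTENING (the listening hash,
     * under inet_listen_lock()), OPENREQ (one listener's SYN table, under
     * that socket's syn_wait_lock), then ESTABLISHED and TIME_WAIT (each
     * ehash bucket's chain and twchain, under the per-bucket lock).  The
     * lock protecting the entry handed back to the seq_file core stays held
     * and is dropped on the next iteration step or in tcp_seq_stop().
     */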
1862
1863static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1864{
1865        return hlist_empty(head) ? NULL :
1866                list_entry(head->first, struct inet_timewait_sock, tw_node);
1867}
1868
1869static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1870{
1871        return tw->tw_node.next ?
1872                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1873}
1874
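    /*
     * May hand back a request_sock from a listener's SYN table; in that case
     * the listener's syn_wait_lock is left read-held and is released on the
     * next call or in tcp_seq_stop().
     */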
1875static void *listening_get_next(struct seq_file *seq, void *cur)
1876{
1877        struct inet_connection_sock *icsk;
1878        struct hlist_node *node;
1879        struct sock *sk = cur;
1880        struct tcp_iter_state* st = seq->private;
1881        struct net *net = seq_file_net(seq);
1882
1883        if (!sk) {
1884                st->bucket = 0;
1885                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1886                goto get_sk;
1887        }
1888
1889        ++st->num;
1890
1891        if (st->state == TCP_SEQ_STATE_OPENREQ) {
1892                struct request_sock *req = cur;
1893
1894                icsk = inet_csk(st->syn_wait_sk);
1895                req = req->dl_next;
1896                while (1) {
1897                        while (req) {
1898                                if (req->rsk_ops->family == st->family) {
1899                                        cur = req;
1900                                        goto out;
1901                                }
1902                                req = req->dl_next;
1903                        }
1904                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1905                                break;
1906get_req:
1907                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1908                }
1909                sk          = sk_next(st->syn_wait_sk);
1910                st->state = TCP_SEQ_STATE_LISTENING;
1911                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1912        } else {
1913                icsk = inet_csk(sk);
1914                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1915                if (reqsk_queue_len(&icsk->icsk_accept_queue))
1916                        goto start_req;
1917                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1918                sk = sk_next(sk);
1919        }
1920get_sk:
1921        sk_for_each_from(sk, node) {
1922                if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1923                        cur = sk;
1924                        goto out;
1925                }
1926                icsk = inet_csk(sk);
1927                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1928                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1929start_req:
1930                        st->uid                = sock_i_uid(sk);
1931                        st->syn_wait_sk = sk;
1932                        st->state        = TCP_SEQ_STATE_OPENREQ;
1933                        st->sbucket        = 0;
1934                        goto get_req;
1935                }
1936                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1937        }
1938        if (++st->bucket < INET_LHTABLE_SIZE) {
1939                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1940                goto get_sk;
1941        }
1942        cur = NULL;
1943out:
1944        return cur;
1945}
1946
1947static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1948{
1949        void *rc = listening_get_next(seq, NULL);
1950
1951        while (rc && *pos) {
1952                rc = listening_get_next(seq, rc);
1953                --*pos;
1954        }
1955        return rc;
1956}
1957
1958static inline int empty_bucket(struct tcp_iter_state *st)
1959{
1960        return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1961                hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1962}
1963
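    /*
     * On success this returns with the matching ehash bucket's lock still
     * read-held; it is released in established_get_next() when the bucket is
     * exhausted, or in tcp_seq_stop().
     */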
1964static void *established_get_first(struct seq_file *seq)
1965{
1966        struct tcp_iter_state* st = seq->private;
1967        struct net *net = seq_file_net(seq);
1968        void *rc = NULL;
1969
1970        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1971                struct sock *sk;
1972                struct hlist_node *node;
1973                struct inet_timewait_sock *tw;
1974                rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1975
1976                /* Lockless fast path for the common case of empty buckets */
1977                if (empty_bucket(st))
1978                        continue;
1979
1980                read_lock_bh(lock);
1981                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1982                        if (sk->sk_family != st->family ||
1983                            !net_eq(sock_net(sk), net)) {
1984                                continue;
1985                        }
1986                        rc = sk;
1987                        goto out;
1988                }
1989                st->state = TCP_SEQ_STATE_TIME_WAIT;
1990                inet_twsk_for_each(tw, node,
1991                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
1992                        if (tw->tw_family != st->family ||
1993                            !net_eq(twsk_net(tw), net)) {
1994                                continue;
1995                        }
1996                        rc = tw;
1997                        goto out;
1998                }
1999                read_unlock_bh(lock);
2000                st->state = TCP_SEQ_STATE_ESTABLISHED;
2001        }
2002out:
2003        return rc;
2004}
2005
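    /*
     * Advances within the currently locked bucket; once the bucket is
     * exhausted its lock is dropped and the next non-empty bucket is locked
     * before an entry is returned (no lock is held when NULL is returned).
     */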
2006static void *established_get_next(struct seq_file *seq, void *cur)
2007{
2008        struct sock *sk = cur;
2009        struct inet_timewait_sock *tw;
2010        struct hlist_node *node;
2011        struct tcp_iter_state* st = seq->private;
2012        struct net *net = seq_file_net(seq);
2013
2014        ++st->num;
2015
2016        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2017                tw = cur;
2018                tw = tw_next(tw);
2019get_tw:
2020                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2021                        tw = tw_next(tw);
2022                }
2023                if (tw) {
2024                        cur = tw;
2025                        goto out;
2026                }
2027                read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2028                st->state = TCP_SEQ_STATE_ESTABLISHED;
2029
2030                /* Look for the next non-empty bucket */
2031                while (++st->bucket < tcp_hashinfo.ehash_size &&
2032                                empty_bucket(st))
2033                        ;
2034                if (st->bucket >= tcp_hashinfo.ehash_size)
2035                        return NULL;
2036
2037                read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2038                sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2039        } else
2040                sk = sk_next(sk);
2041
2042        sk_for_each_from(sk, node) {
2043                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2044                        goto found;
2045        }
2046
2047        st->state = TCP_SEQ_STATE_TIME_WAIT;
2048        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2049        goto get_tw;
2050found:
2051        cur = sk;
2052out:
2053        return cur;
2054}
2055
2056static void *established_get_idx(struct seq_file *seq, loff_t pos)
2057{
2058        void *rc = established_get_first(seq);
2059
2060        while (rc && pos) {
2061                rc = established_get_next(seq, rc);
2062                --pos;
2063        }
2064        return rc;
2065}
2066
2067static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2068{
2069        void *rc;
2070        struct tcp_iter_state* st = seq->private;
2071
2072        inet_listen_lock(&tcp_hashinfo);
2073        st->state = TCP_SEQ_STATE_LISTENING;
2074        rc          = listening_get_idx(seq, &pos);
2075
2076        if (!rc) {
2077                inet_listen_unlock(&tcp_hashinfo);
2078                st->state = TCP_SEQ_STATE_ESTABLISHED;
2079                rc          = established_get_idx(seq, pos);
2080        }
2081
2082        return rc;
2083}
2084
2085static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2086{
2087        struct tcp_iter_state* st = seq->private;
2088        st->state = TCP_SEQ_STATE_LISTENING;
2089        st->num = 0;
2090        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2091}
2092
2093static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2094{
2095        void *rc = NULL;
2096        struct tcp_iter_state* st;
2097
2098        if (v == SEQ_START_TOKEN) {
2099                rc = tcp_get_idx(seq, 0);
2100                goto out;
2101        }
2102        st = seq->private;
2103
2104        switch (st->state) {
2105        case TCP_SEQ_STATE_OPENREQ:
2106        case TCP_SEQ_STATE_LISTENING:
2107                rc = listening_get_next(seq, v);
2108                if (!rc) {
2109                        inet_listen_unlock(&tcp_hashinfo);
2110                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2111                        rc          = established_get_first(seq);
2112                }
2113                break;
2114        case TCP_SEQ_STATE_ESTABLISHED:
2115        case TCP_SEQ_STATE_TIME_WAIT:
2116                rc = established_get_next(seq, v);
2117                break;
2118        }
2119out:
2120        ++*pos;
2121        return rc;
2122}
2123
2124static void tcp_seq_stop(struct seq_file *seq, void *v)
2125{
2126        struct tcp_iter_state* st = seq->private;
2127
2128        switch (st->state) {
2129        case TCP_SEQ_STATE_OPENREQ:
2130                if (v) {
2131                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2132                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2133                }
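                    /* fall through: the listening hash lock must be dropped as well */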
2134        case TCP_SEQ_STATE_LISTENING:
2135                if (v != SEQ_START_TOKEN)
2136                        inet_listen_unlock(&tcp_hashinfo);
2137                break;
2138        case TCP_SEQ_STATE_TIME_WAIT:
2139        case TCP_SEQ_STATE_ESTABLISHED:
2140                if (v)
2141                        read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2142                break;
2143        }
2144}
2145
2146static int tcp_seq_open(struct inode *inode, struct file *file)
2147{
2148        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2149        struct tcp_iter_state *s;
2150        int err;
2151
2152        err = seq_open_net(inode, file, &afinfo->seq_ops,
2153                          sizeof(struct tcp_iter_state));
2154        if (err < 0)
2155                return err;
2156
2157        s = ((struct seq_file *)file->private_data)->private;
2158        s->family                = afinfo->family;
2159        return 0;
2160}
2161
2162int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2163{
2164        int rc = 0;
2165        struct proc_dir_entry *p;
2166
2167        afinfo->seq_fops.open                = tcp_seq_open;
2168        afinfo->seq_fops.read                = seq_read;
2169        afinfo->seq_fops.llseek                = seq_lseek;
2170        afinfo->seq_fops.release        = seq_release_net;
2171
2172        afinfo->seq_ops.start                = tcp_seq_start;
2173        afinfo->seq_ops.next                = tcp_seq_next;
2174        afinfo->seq_ops.stop                = tcp_seq_stop;
2175
2176        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2177                             &afinfo->seq_fops, afinfo);
2178        if (!p)
2179                rc = -ENOMEM;
2180        return rc;
2181}
2182
2183void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2184{
2185        proc_net_remove(net, afinfo->name);
2186}
2187
2188static void get_openreq4(struct sock *sk, struct request_sock *req,
2189                         struct seq_file *f, int i, int uid, int *len)
2190{
2191        const struct inet_request_sock *ireq = inet_rsk(req);
2192        int ttd = req->expires - jiffies;
2193
2194        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2195                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2196                i,
2197                ireq->loc_addr,
2198                ntohs(inet_sk(sk)->sport),
2199                ireq->rmt_addr,
2200                ntohs(ireq->rmt_port),
2201                TCP_SYN_RECV,
2202                0, 0, /* could print option size, but that is af dependent. */
2203                1,    /* timers active (only the expire timer) */
2204                jiffies_to_clock_t(ttd),
2205                req->retrans,
2206                uid,
2207                0,  /* non standard timer */
2208                0, /* open_requests have no inode */
2209                atomic_read(&sk->sk_refcnt),
2210                req,
2211                len);
2212}
2213
2214static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2215{
2216        int timer_active;
2217        unsigned long timer_expires;
2218        struct tcp_sock *tp = tcp_sk(sk);
2219        const struct inet_connection_sock *icsk = inet_csk(sk);
2220        struct inet_sock *inet = inet_sk(sk);
2221        __be32 dest = inet->daddr;
2222        __be32 src = inet->rcv_saddr;
2223        __u16 destp = ntohs(inet->dport);
2224        __u16 srcp = ntohs(inet->sport);
2225
2226        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2227                timer_active        = 1;
2228                timer_expires        = icsk->icsk_timeout;
2229        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2230                timer_active        = 4;
2231                timer_expires        = icsk->icsk_timeout;
2232        } else if (timer_pending(&sk->sk_timer)) {
2233                timer_active        = 2;
2234                timer_expires        = sk->sk_timer.expires;
2235        } else {
2236                timer_active        = 0;
2237                timer_expires = jiffies;
2238        }
2239
2240        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2241                        "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2242                i, src, srcp, dest, destp, sk->sk_state,
2243                tp->write_seq - tp->snd_una,
2244                sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2245                                             (tp->rcv_nxt - tp->copied_seq),
2246                timer_active,
2247                jiffies_to_clock_t(timer_expires - jiffies),
2248                icsk->icsk_retransmits,
2249                sock_i_uid(sk),
2250                icsk->icsk_probes_out,
2251                sock_i_ino(sk),
2252                atomic_read(&sk->sk_refcnt), sk,
2253                jiffies_to_clock_t(icsk->icsk_rto),
2254                jiffies_to_clock_t(icsk->icsk_ack.ato),
2255                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2256                tp->snd_cwnd,
2257                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2258                len);
2259}
2260
2261static void get_timewait4_sock(struct inet_timewait_sock *tw,
2262                               struct seq_file *f, int i, int *len)
2263{
2264        __be32 dest, src;
2265        __u16 destp, srcp;
2266        int ttd = tw->tw_ttd - jiffies;
2267
2268        if (ttd < 0)
2269                ttd = 0;
2270
2271        dest  = tw->tw_daddr;
2272        src   = tw->tw_rcv_saddr;
2273        destp = ntohs(tw->tw_dport);
2274        srcp  = ntohs(tw->tw_sport);
2275
2276        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2277                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2278                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2279                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2280                atomic_read(&tw->tw_refcnt), tw, len);
2281}
2282
2283#define TMPSZ 150
2284
2285static int tcp4_seq_show(struct seq_file *seq, void *v)
2286{
2287        struct tcp_iter_state* st;
2288        int len;
2289
2290        if (v == SEQ_START_TOKEN) {
2291                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2292                           "  sl  local_address rem_address   st tx_queue "
2293                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2294                           "inode");
2295                goto out;
2296        }
2297        st = seq->private;
2298
2299        switch (st->state) {
2300        case TCP_SEQ_STATE_LISTENING:
2301        case TCP_SEQ_STATE_ESTABLISHED:
2302                get_tcp4_sock(v, seq, st->num, &len);
2303                break;
2304        case TCP_SEQ_STATE_OPENREQ:
2305                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2306                break;
2307        case TCP_SEQ_STATE_TIME_WAIT:
2308                get_timewait4_sock(v, seq, st->num, &len);
2309                break;
2310        }
2311        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2312out:
2313        return 0;
2314}
2315
2316static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2317        .name                = "tcp",
2318        .family                = AF_INET,
2319        .seq_fops        = {
2320                .owner                = THIS_MODULE,
2321        },
2322        .seq_ops        = {
2323                .show                = tcp4_seq_show,
2324        },
2325};
2326
2327static int tcp4_proc_init_net(struct net *net)
2328{
2329        return tcp_proc_register(net, &tcp4_seq_afinfo);
2330}
2331
2332static void tcp4_proc_exit_net(struct net *net)
2333{
2334        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2335}
2336
2337static struct pernet_operations tcp4_net_ops = {
2338        .init = tcp4_proc_init_net,
2339        .exit = tcp4_proc_exit_net,
2340};
2341
2342int __init tcp4_proc_init(void)
2343{
2344        return register_pernet_subsys(&tcp4_net_ops);
2345}
2346
2347void tcp4_proc_exit(void)
2348{
2349        unregister_pernet_subsys(&tcp4_net_ops);
2350}
2351#endif /* CONFIG_PROC_FS */
2352
2353struct proto tcp_prot = {
2354        .name                        = "TCP",
2355        .owner                        = THIS_MODULE,
2356        .close                        = tcp_close,
2357        .connect                = tcp_v4_connect,
2358        .disconnect                = tcp_disconnect,
2359        .accept                        = inet_csk_accept,
2360        .ioctl                        = tcp_ioctl,
2361        .init                        = tcp_v4_init_sock,
2362        .destroy                = tcp_v4_destroy_sock,
2363        .shutdown                = tcp_shutdown,
2364        .setsockopt                = tcp_setsockopt,
2365        .getsockopt                = tcp_getsockopt,
2366        .recvmsg                = tcp_recvmsg,
2367        .backlog_rcv                = tcp_v4_do_rcv,
2368        .hash                        = inet_hash,
2369        .unhash                        = inet_unhash,
2370        .get_port                = inet_csk_get_port,
2371        .enter_memory_pressure        = tcp_enter_memory_pressure,
2372        .sockets_allocated        = &tcp_sockets_allocated,
2373        .orphan_count                = &tcp_orphan_count,
2374        .memory_allocated        = &tcp_memory_allocated,
2375        .memory_pressure        = &tcp_memory_pressure,
2376        .sysctl_mem                = sysctl_tcp_mem,
2377        .sysctl_wmem                = sysctl_tcp_wmem,
2378        .sysctl_rmem                = sysctl_tcp_rmem,
2379        .max_header                = MAX_TCP_HEADER,
2380        .obj_size                = sizeof(struct tcp_sock),
2381        .twsk_prot                = &tcp_timewait_sock_ops,
2382        .rsk_prot                = &tcp_request_sock_ops,
2383        .h.hashinfo                = &tcp_hashinfo,
2384#ifdef CONFIG_COMPAT
2385        .compat_setsockopt        = compat_tcp_setsockopt,
2386        .compat_getsockopt        = compat_tcp_getsockopt,
2387#endif
2388};
2389
2390
2391static int __net_init tcp_sk_init(struct net *net)
2392{
2393        return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2394                                    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2395}
2396
2397static void __net_exit tcp_sk_exit(struct net *net)
2398{
2399        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2400        inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2401}
2402
2403static struct pernet_operations __net_initdata tcp_sk_ops = {
2404       .init = tcp_sk_init,
2405       .exit = tcp_sk_exit,
2406};
2407
2408void __init tcp_v4_init(void)
2409{
2410        if (register_pernet_device(&tcp_sk_ops))
2411                panic("Failed to create the TCP control socket.\n");
2412}
2413
2414EXPORT_SYMBOL(ipv4_specific);
2415EXPORT_SYMBOL(tcp_hashinfo);
2416EXPORT_SYMBOL(tcp_prot);
2417EXPORT_SYMBOL(tcp_v4_conn_request);
2418EXPORT_SYMBOL(tcp_v4_connect);
2419EXPORT_SYMBOL(tcp_v4_do_rcv);
2420EXPORT_SYMBOL(tcp_v4_remember_stamp);
2421EXPORT_SYMBOL(tcp_v4_send_check);
2422EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2423
2424#ifdef CONFIG_PROC_FS
2425EXPORT_SYMBOL(tcp_proc_register);
2426EXPORT_SYMBOL(tcp_proc_unregister);
2427#endif
2428EXPORT_SYMBOL(sysctl_tcp_low_latency);
2429