Linux 协议栈分析 socket

Linux.协议栈分析.socket SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol){int retval;struct socket *sock;int flags;/* Check the SOCK_* constants for consistency.

文心叼虫

7794人浏览 · 2011-10-24 16:55:25

文心叼虫 · 2011-10-24 16:55:25 发布

1278~1281行就是取得type的值并检查是否合法。
我们知道socket对于用户而言就是一个已经打开的特殊文件，而内核则为插口(socket)定义了一种特殊的文件类型，形成特殊的文件系统sockfs(net/socket.c)。sys_socket中调用了两个函数sock_create和sock_map_fd，可以看到这两个函数共用一个sock参数，这便是内核管理socket用的；而sock_map_fd明显是为用户提供已经打开的文件号。
sockfs的建立过程省略，sockfs的定义如下：

（以下代码取自net/socket.c第301～307行）

/* Mount of the sockfs pseudo filesystem; sock_alloc() reads its superblock (mnt_sb). */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor for the socket pseudo filesystem, named "sockfs". */
static struct file_system_type sock_fs_type = {

.name = "sockfs",

/* NOTE(review): presumably builds the sockfs superblock; sockfs_get_sb is not shown here. */
.get_sb = sockfs_get_sb,

.kill_sb = kill_anon_super,

};

而所谓的通过socket函数创建一个插口，就是在sockfs中创建一个特殊文件，或者说是一个结点，并为实现相应插口功能建立起一整套数据结构。所以首先就通过sock_create创建一个struct socket数据结构，然后通过sock_map_fd映射到一个已经打开的文件上。在分析sock_create和sock_map_fd之前先看看struct socket的定义。

我们知道socket对于用户而言就是一个已经打开的特殊文件，而内核则为插口(socket)定义了一种特殊的文件类型，形成特殊的文件系统sockfs(net/socket.c)。sys_socket中调用了两个函数sock_create和sock_map_fd，可以看到这两个函数共用一个sock参数，这便是内核管理socket用的；而sock_map_fd明显是为用户提供已经打开的文件号。
sockfs的建立过程省略，sockfs的定义如下：

/* Mount of the sockfs pseudo filesystem; sock_alloc() reads its superblock (mnt_sb). */
static struct vfsmount *sock_mnt __read_mostly;
 
/* Filesystem type descriptor for the socket pseudo filesystem, named "sockfs". */
static struct file_system_type sock_fs_type = {
  .name =    "sockfs",
  /* NOTE(review): presumably builds the sockfs superblock; sockfs_get_sb is not shown here. */
  .get_sb =  sockfs_get_sb,
  .kill_sb =  kill_anon_super,
};

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @fasync_list: Asynchronous wake up list
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wait: wait queue for several uses
 *
 *  Lives inside struct socket_alloc next to its inode. sock_alloc_inode()
 *  starts every instance with @state SS_UNCONNECTED, @flags 0 and
 *  @ops, @sk, @file and @fasync_list set to NULL; __sock_create() then
 *  stores @type, and the family's ->create() (e.g. inet_create()) fills
 *  in @ops.
 */
struct socket {
  socket_state    state;
 
  /* The begin/end markers bracket the short-sized type field for kmemcheck. */
  kmemcheck_bitfield_begin(type);
  short      type;
  kmemcheck_bitfield_end(type);
 
  unsigned long    flags;
  /*
   * Please keep fasync_list & wait fields in the same cache line
   */
  struct fasync_struct  *fasync_list;
  wait_queue_head_t  wait;
 
  struct file    *file;
  struct sock    *sk;
  const struct proto_ops  *ops;
};

/*
 * struct proto_ops - operation table reached through socket->ops.
 *
 * Each member is one socket-level operation for a given family and socket
 * type; inet_stream_ops, inet_dgram_ops and inet_sockraw_ops are the PF_INET
 * instances. @owner is the module that __sock_create() pins with
 * try_module_get() once ->create() has installed the table on a socket.
 */
struct proto_ops {
  int    family;
  struct module  *owner;
  int    (*release)   (struct socket *sock);
  int    (*bind)       (struct socket *sock,
              struct sockaddr *myaddr,
              int sockaddr_len);
  int    (*connect)   (struct socket *sock,
              struct sockaddr *vaddr,
              int sockaddr_len, int flags);
  int    (*socketpair)(struct socket *sock1,
              struct socket *sock2);
  int    (*accept)    (struct socket *sock,
              struct socket *newsock, int flags);
  int    (*getname)   (struct socket *sock,
              struct sockaddr *addr,
              int *sockaddr_len, int peer);
  unsigned int  (*poll)       (struct file *file, struct socket *sock,
              struct poll_table_struct *wait);
  int    (*ioctl)     (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
              unsigned long arg);
  int    (*listen)    (struct socket *sock, int len);
  int    (*shutdown)  (struct socket *sock, int flags);
  int    (*setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*compat_setsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, unsigned int optlen);
  int    (*compat_getsockopt)(struct socket *sock, int level,
              int optname, char __user *optval, int __user *optlen);
  int    (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len);
  int    (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
              struct msghdr *m, size_t total_len,
              int flags);
  int    (*mmap)       (struct file *file, struct socket *sock,
              struct vm_area_struct * vma);
  ssize_t    (*sendpage)  (struct socket *sock, struct page *page,
              int offset, size_t size, int flags);
  ssize_t   (*splice_read)(struct socket *sock,  loff_t *ppos,
               struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};

接下来分析sock_create(net/socket.c)，sock_create会调用__sock_create。

/*
 * __sock_create - allocate a socket and let the protocol family set it up.
 * @net:      network namespace handed to the family's ->create()
 * @family:   protocol family, used as index into net_families[]
 *            (must satisfy 0 <= family < NPROTO)
 * @type:     socket type (must satisfy 0 <= type < SOCK_MAX); stored in
 *            sock->type
 * @protocol: protocol number, passed through to ->create()
 * @res:      receives the new socket on success; not written on failure
 * @kern:     forwarded to the two security hooks only
 *
 * Returns 0 on success or a negative errno: -EAFNOSUPPORT (family out of
 * range, not present in net_families[], or a module reference could not be
 * taken), -EINVAL (type out of range), -ENFILE (sock_alloc() failed), or
 * whatever the security hooks or ->create() return.
 */
static int __sock_create(struct net *net, int family, int type, int protocol,
       struct socket **res, int kern)
{
  int err;
  struct socket *sock;
  const struct net_proto_family *pf;
 
  /*
   *      Check protocol is in range
   */
  if (family < 0 || family >= NPROTO)
    return -EAFNOSUPPORT;
  if (type < 0 || type >= SOCK_MAX)
    return -EINVAL;
 
  /* Compatibility.
 
     This uglymoron is moved from INET layer to here to avoid
     deadlock in module load.
   */
  if (family == PF_INET && type == SOCK_PACKET) {
    /* Function-static flag: the notice is printed only the first time. */
    static int warned;
    if (!warned) {
      warned = 1;
      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
    }
    /* The request is silently redirected to the packet family. */
    family = PF_PACKET;
  }
 
  /* First security hook runs before anything has been allocated. */
  err = security_socket_create(family, type, protocol, kern);
  if (err)
    return err;
 
  /*
   *  Allocate the socket and allow the family to set things up. if
   *  the protocol is 0, the family is instructed to select an appropriate
   *  default.
   */
  sock = sock_alloc();
  if (!sock) {
    if (net_ratelimit())
      printk(KERN_WARNING "socket: no more sockets\n");
    return -ENFILE;  /* Not exactly a match, but its the
           closest posix thing */
  }
 
  sock->type = type;
 
#ifdef CONFIG_MODULES
  /* Attempt to load a protocol module if the find failed.
   *
   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
   * requested real, full-featured networking support upon configuration.
   * Otherwise module support will break!
   */
  if (net_families[family] == NULL)
    request_module("net-pf-%d", family);
#endif
 
  /* The family entry is looked up under RCU; err is preset for both failure exits below. */
  rcu_read_lock();
  pf = rcu_dereference(net_families[family]);
  err = -EAFNOSUPPORT;
  if (!pf)
    goto out_release;
 
  /*
   * We will call the ->create function, that possibly is in a loadable
   * module, so we have to bump that loadable module refcnt first.
   */
  if (!try_module_get(pf->owner))
    goto out_release;
 
  /* Now protected by module ref count */
  rcu_read_unlock();
 
  /* Family-specific setup; for PF_INET this is inet_create(). */
  err = pf->create(net, sock, protocol);
  if (err < 0)
    goto out_module_put;
 
  /*
   * Now to bump the refcnt of the [loadable] module that owns this
   * socket at sock_release time we decrement its refcnt.
   */
  if (!try_module_get(sock->ops->owner))
    goto out_module_busy;
 
  /*
   * Now that we're done with the ->create function, the [loadable]
   * module can have its refcnt decremented
   */
  module_put(pf->owner);
  /* Second security hook; on failure the socket is released with ops still set. */
  err = security_socket_post_create(sock, family, type, protocol, kern);
  if (err)
    goto out_sock_release;
  *res = sock;
 
  return 0;
 
  /* Error exits: each label falls through into the cleanup steps below it. */
out_module_busy:
  err = -EAFNOSUPPORT;
out_module_put:
  /*
   * NOTE(review): presumably cleared so that sock_release() does not act
   * on an ops table whose module reference was never taken - confirm
   * against sock_release(), which is not shown here.
   */
  sock->ops = NULL;
  module_put(pf->owner);
out_sock_release:
  sock_release(sock);
  return err;
 
out_release:
  /* Reached with the RCU read lock still held. */
  rcu_read_unlock();
  goto out_sock_release;
}
 
/*
 * sock_create - create a socket in the calling task's network namespace.
 * @family:   protocol family
 * @type:     socket type
 * @protocol: protocol number
 * @res:      receives the new socket on success
 *
 * Thin wrapper around __sock_create(); returns its result unchanged.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
  struct net *caller_ns = current->nsproxy->net_ns;

  /* The trailing 0 is __sock_create()'s "kern" argument. */
  return __sock_create(caller_ns, family, type, protocol, res, 0);
}

1150~1171行做的很简单，不过是参数检查。
接下来的security_socket_create以及后面的security_socket_post_create，在内核未配置CONFIG_SECURITY_NETWORK时，都是定义在include/linux/security.h中的空函数：

/*
 * security_socket_create - stub form of the pre-creation security hook.
 *
 * Ignores all four arguments and always returns 0, so the caller
 * (__sock_create()) proceeds to allocate the socket.
 */
static inline int
security_socket_create(int family, int type, int protocol, int kern)
{
  /* Nothing to veto: unconditionally report success. */
  return 0;
}
/*
 * security_socket_post_create - stub form of the post-creation security hook.
 *
 * Ignores @sock and the remaining arguments and always returns 0, so
 * __sock_create() hands the finished socket back to its caller.
 */
static inline int
security_socket_post_create(struct socket *sock, int family, int type,
                            int protocol, int kern)
{
  /* No checks are performed in this variant. */
  return 0;
}

1182行的sock_alloc的代码如下：

/*
 * sock_alloc - obtain a fresh socket together with its sockfs inode.
 *
 * Asks new_inode() for an inode on the sockfs superblock, takes the
 * struct socket that sits next to it (SOCKET_I), marks the inode as a
 * socket owned by the current fsuid/fsgid and bumps the sockets_in_use
 * per-cpu counter.
 *
 * Returns the socket, or NULL when no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
  struct socket *sock = NULL;
  struct inode *node = new_inode(sock_mnt->mnt_sb);

  if (node) {
    sock = SOCKET_I(node);
    kmemcheck_annotate_bitfield(sock, type);

    /* Socket file type, rwx for user, group and other. */
    node->i_mode = S_IFSOCK | S_IRWXUGO;
    node->i_uid = current_fsuid();
    node->i_gid = current_fsgid();

    percpu_add(sockets_in_use, 1);
  }

  return sock;
}

其中的new_inode是在/fs/inode.c中定义

/*
 * alloc_inode - allocate and initialise one inode for superblock @sb.
 *
 * Uses the filesystem's own ->alloc_inode() hook when it provides one
 * (sockfs supplies sock_alloc_inode) and the generic inode_cachep slab
 * otherwise, then runs inode_init_always() on the result.
 *
 * Returns the inode, or NULL if allocation or initialisation failed.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
  struct inode *node;

  node = sb->s_op->alloc_inode ? sb->s_op->alloc_inode(sb)
                               : kmem_cache_alloc(inode_cachep, GFP_KERNEL);
  if (!node)
    return NULL;

  if (unlikely(inode_init_always(sb, node))) {
    /*
     * Initialisation failed: give the memory back through the
     * filesystem's ->destroy_inode() hook if it has one, else
     * straight to inode_cachep.
     */
    if (node->i_sb->s_op->destroy_inode)
      node->i_sb->s_op->destroy_inode(node);
    else
      kmem_cache_free(inode_cachep, node);
    node = NULL;
  }

  return node;
}

/*
 * new_inode - allocate an inode for @sb and put it on the inode lists.
 *
 * Returns the new inode with a fresh i_ino and i_state cleared, or NULL
 * when alloc_inode() fails.
 */
struct inode *new_inode(struct super_block *sb)
{
  /*
   * Inode numbers come from a 32-bit counter on purpose: a 32-bit,
   * non-LFS stat() in glibc reports EOVERFLOW when st_ino does not fit
   * the target field, and a narrow counter tries to avoid that.
   */
  static unsigned int last_ino;
  struct inode *node;

  spin_lock_prefetch(&inode_lock);

  node = alloc_inode(sb);
  if (!node)
    return NULL;

  /* List insertion and number assignment happen under inode_lock. */
  spin_lock(&inode_lock);
  __inode_add_to_lists(sb, NULL, node);
  node->i_ino = ++last_ino;
  node->i_state = 0;
  spin_unlock(&inode_lock);

  return node;
}
EXPORT_SYMBOL(new_inode);

可以看出new_inode会调用alloc_inode分配inode，而alloc_inode会调用sockfs在VFS中注册的相应的函数来处理，那这个函数是什么呢？先来看一看/net/socket.c

static struct inode *sock_alloc_inode(struct super_block *sb)
{
  struct socket_alloc *ei;
 
  ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
  if (!ei)
    return NULL;
  init_waitqueue_head(&ei->socket.wait);
 
  ei->socket.fasync_list = NULL;
  ei->socket.state = SS_UNCONNECTED;
  ei->socket.flags = 0;
  ei->socket.ops = NULL;
  ei->socket.sk = NULL;
  ei->socket.file = NULL;
 
  return &ei->vfs_inode;
}

/*
 * Superblock operations of sockfs. alloc_inode() in fs/inode.c calls
 * sb->s_op->alloc_inode when it is set, which is how inode allocation for
 * sockets ends up in sock_alloc_inode().
 */
static const struct super_operations sockfs_ops = {
  .alloc_inode =  sock_alloc_inode,
  .destroy_inode =sock_destroy_inode,
  .statfs =  simple_statfs,
};

为帮助理解列出struct socket_alloc 结构体的定义。

/*
 * A socket and its sockfs inode allocated as one object.
 * sock_alloc_inode() hands out &vfs_inode; SOCKET_I() goes back from that
 * inode pointer to the socket member with container_of().
 */
struct socket_alloc {
  struct socket socket;
  struct inode vfs_inode;
};
 
static inline struct socket *SOCKET_I(struct inode *inode)
{
  return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

可以看到这个函数其实就是sock_alloc_inode,该函数分配了一个struct socket_alloc类型的结构体，然后返回这个结构体中的一个成员变量vfs_inode的地址，可以看出来这就是一个inode结构。然后就回到了sock_alloc函数的第489行，通过SOCKET_I获得与vfs_inode同在socket_alloc结构体中的成员socket的地址。然后程序返回到__sock_create的1190行。

1192开始的代码说明，如果编译内核开启了CONFIG_MODULES也就是内核模块的选项就先检查内核现在是否有支持由family(就是domain)所指定的网域的代码，如果没有则通过request_module来安装。

说到这里就先看看1204行的net_families这个数组，很明显它是控制和操作各个网域的一个控制结构体的集合，通过变量pf可以发现它的类型为struct net_proto_family(/include/linux/net.h)

/*
 * Per-family entry of net_families[], indexed by family number.
 * __sock_create() looks the entry up under RCU, pins @owner with
 * try_module_get() and then calls @create to initialise the new socket.
 */
struct net_proto_family {
  int    family;
  int    (*create)(struct net *net, struct socket *sock, int protocol);
  struct module  *owner;
};

然后1219行通过pf调用相应网域的create的函数，可以很简单地得出对于AF_UNIX, AF_INET, AF_INET6, AF_PACKET这些所对应的create函数肯定不一样。接下来我们以AF_INET为例说明。在/net/ipv4/af_inet.c中

/*
 * PF_INET family entry: socket creation for this family goes through
 * inet_create(). inet_init() passes this object to sock_register().
 */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};

由936行可以得出对于AF_INET其create函数为inet_create，定义于同一文件中。

/*
 * inet_create - PF_INET ->create() hook, called from __sock_create().
 * @net:      network namespace the socket is created in
 * @sock:     socket to set up; sock->type has already been stored by the
 *            caller
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard and is
 *            replaced by the protocol of the first entry found
 *
 * Picks the inet_protosw entry on inetsw[sock->type] that matches
 * @protocol, installs its proto_ops in sock->ops, allocates the struct
 * sock with sk_alloc() and initialises the inet-specific fields.
 *
 * Returns 0 on success or a negative errno: -ESOCKTNOSUPPORT (no entry
 * at all for this type), -EPROTONOSUPPORT (entries exist but none
 * matches), -EPERM (required capability missing), -EAFNOSUPPORT
 * (inet_netns_ok() refused), -ENOBUFS (sk_alloc() failed), or the error
 * returned by the protocol's ->init().
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
  struct sock *sk;
  struct inet_protosw *answer;
  struct inet_sock *inet;
  struct proto *answer_prot;
  unsigned char answer_flags;
  char answer_no_check;
  int try_loading_module = 0;
  int err;
 
  /* Build the ehash secret on first use; skipped for raw and datagram sockets. */
  if (unlikely(!inet_ehash_secret))
    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
      build_ehash_secret();
 
  sock->state = SS_UNCONNECTED;
 
  /* Look for the requested type/protocol pair. */
lookup_protocol:
  /* Stays -ESOCKTNOSUPPORT only if the list for this type is empty. */
  err = -ESOCKTNOSUPPORT;
  rcu_read_lock();
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
    /* err is 0 at every break below, i.e. whenever an entry is accepted. */
    err = 0;
    /* Check the non-wild match. */
    if (protocol == answer->protocol) {
      /* An exact match only counts when it is not the wildcard value. */
      if (protocol != IPPROTO_IP)
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) {
        /* Caller asked for "any": adopt this entry's protocol. */
        protocol = answer->protocol;
        break;
      }
      /* Entry is a wildcard (e.g. the SOCK_RAW entry): accept any protocol. */
      if (IPPROTO_IP == answer->protocol)
        break;
    }
    err = -EPROTONOSUPPORT;
  }
 
  if (unlikely(err)) {
    /* At most two module-load attempts, each followed by a fresh lookup. */
    if (try_loading_module < 2) {
      rcu_read_unlock();
      /*
       * Be more specific, e.g. net-pf-2-proto-132-type-1
       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
       */
      if (++try_loading_module == 1)
        request_module("net-pf-%d-proto-%d-type-%d",
                 PF_INET, protocol, sock->type);
      /*
       * Fall back to generic, e.g. net-pf-2-proto-132
       * (net-pf-PF_INET-proto-IPPROTO_SCTP)
       */
      else
        request_module("net-pf-%d-proto-%d",
                 PF_INET, protocol);
      goto lookup_protocol;
    } else
      goto out_rcu_unlock;
  }
 
  /* Entries with capability <= 0 (inetsw_array uses -1) need no capability. */
  err = -EPERM;
  if (answer->capability > 0 && !capable(answer->capability))
    goto out_rcu_unlock;
 
  err = -EAFNOSUPPORT;
  if (!inet_netns_ok(net, protocol))
    goto out_rcu_unlock;
 
  /* Copy what is needed out of the entry before leaving the RCU section. */
  sock->ops = answer->ops;
  answer_prot = answer->prot;
  answer_no_check = answer->no_check;
  answer_flags = answer->flags;
  rcu_read_unlock();
 
  WARN_ON(answer_prot->slab == NULL);
 
  err = -ENOBUFS;
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;
 
  err = 0;
  sk->sk_no_check = answer_no_check;
  if (INET_PROTOSW_REUSE & answer_flags)
    sk->sk_reuse = 1;
 
  inet = inet_sk(sk);
  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
  /* Raw sockets use the protocol number as inet->num. */
  if (SOCK_RAW == sock->type) {
    inet->num = protocol;
    if (IPPROTO_RAW == protocol)
      inet->hdrincl = 1;
  }
 
  if (ipv4_config.no_pmtu_disc)
    inet->pmtudisc = IP_PMTUDISC_DONT;
  else
    inet->pmtudisc = IP_PMTUDISC_WANT;
 
  inet->id = 0;
 
  /* NOTE(review): presumably links sock and sk to each other - sock_init_data() is not shown here. */
  sock_init_data(sock, sk);
 
  sk->sk_destruct     = inet_sock_destruct;
  sk->sk_protocol     = protocol;
  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
 
  inet->uc_ttl  = -1;
  inet->mc_loop  = 1;
  inet->mc_ttl  = 1;
  inet->mc_all  = 1;
  inet->mc_index  = 0;
  inet->mc_list  = NULL;
 
  sk_refcnt_debug_inc(sk);
 
  if (inet->num) {
    /* It assumes that any protocol which allows
     * the user to assign a number at socket
     * creation time automatically
     * shares.
     */
    inet->sport = htons(inet->num);
    /* Add to protocol hash chains. */
    sk->sk_prot->hash(sk);
  }
 
  /* Protocol-specific init (tcp_prot sets this to tcp_v4_init_sock). */
  if (sk->sk_prot->init) {
    err = sk->sk_prot->init(sk);
    /* On failure the sock is released here and err is returned to the caller. */
    if (err)
      sk_common_release(sk);
  }
out:
  return err;
out_rcu_unlock:
  /* Failure exits taken while still inside the RCU read section. */
  rcu_read_unlock();
  goto out;
}

第283到325行就是通过type和protocol从inetsw中找出对应的struct inet_protosw结构体。inetsw定义于net/ipv4/af_inet.c中：

/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * One list head per socket type. inet_init() initialises the heads,
 * inet_register_protosw() adds entries while holding inetsw_lock, and
 * inet_create() walks the list for sock->type under rcu_read_lock().
 */
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);

而对于struct inet_protosw是在/include/net/protocol.h中定义

/* This is used to register socket interfaces for IP protocols.
 *
 * Entries are linked through @list onto inetsw[type]. inet_create()
 * copies @ops into socket->ops, passes @prot to sk_alloc(), and applies
 * @no_check and @flags to the new sock.
 */
struct inet_protosw {
  struct list_head list;
 
        /* These two fields form the lookup key.  */
  unsigned short   type;     /* This is the 2nd argument to socket(2). */
  unsigned short   protocol; /* This is the L4 protocol number.  */
 
  struct proto   *prot;
  const struct proto_ops *ops;
 
  /* inet_create() only checks capable() when this value is > 0. */
  int              capability; /* Which (if any) capability do
              * we need to use this socket
              * interface?
                                      */
  char             no_check;   /* checksum on rcv/xmit/none? */
  unsigned char   flags;      /* See INET_PROTOSW_* below.  */
};

inetsw其实就是Linux内核中典型的链表数组：数组按type索引，每个元素是一条链表的表头。inetsw是通过inet_register_protosw初始化的

/*
 * inet_register_protosw - add a protocol switch entry to inetsw[p->type].
 * @p: entry to insert; it is linked in place via p->list
 *
 * Runs under inetsw_lock. The entry is rejected, with only a KERN_ERR
 * message, when p->type is >= SOCK_MAX or when a permanent entry with the
 * same protocol already exists on the list. The function returns nothing,
 * so callers cannot tell from it whether the entry was registered.
 */
void inet_register_protosw(struct inet_protosw *p)
{
  struct list_head *lh;
  struct inet_protosw *answer;
  int protocol = p->protocol;
  struct list_head *last_perm;
 
  spin_lock_bh(&inetsw_lock);
 
  if (p->type >= SOCK_MAX)
    goto out_illegal;
 
  /* If we are trying to override a permanent protocol, bail. */
  answer = NULL;
  /* Insertion point: the list head until a permanent entry is seen. */
  last_perm = &inetsw[p->type];
  list_for_each(lh, &inetsw[p->type]) {
    answer = list_entry(lh, struct inet_protosw, list);
 
    /* Check only the non-wild match. */
    if (INET_PROTOSW_PERMANENT & answer->flags) {
      /* Leaving the loop with answer != NULL signals the conflict. */
      if (protocol == answer->protocol)
        break;
      last_perm = lh;
    }
 
    /* Reset so that finishing the loop normally leaves answer NULL. */
    answer = NULL;
  }
  if (answer)
    goto out_permanent;
 
  /* Add the new entry after the last permanent entry if any, so that
   * the new entry does not override a permanent entry when matched with
   * a wild-card protocol. But it is allowed to override any existing
   * non-permanent entry.  This means that when we remove this entry, the
   * system automatically returns to the old behavior.
   */
  list_add_rcu(&p->list, last_perm);
out:
  spin_unlock_bh(&inetsw_lock);
 
  return;
 
out_permanent:
  printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
         protocol);
  goto out;
 
out_illegal:
  printk(KERN_ERR
         "Ignoring attempt to register invalid socket type %d.\n",
         p->type);
  goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

对于inet_register_protosw的调用是在inet_init中的第1593行进行的。

/*
 * inet_init - boot-time initialisation of the PF_INET stack.
 *
 * Registers the tcp/udp/raw struct proto objects, registers the PF_INET
 * family with the socket layer, adds the base IP protocols, fills the
 * inetsw[] lists from inetsw_array[], and then initialises the individual
 * protocol modules (ARP, IP, TCP, UDP, UDP-Lite, ICMP, ...).
 *
 * Returns 0 on success. Only a proto_register() failure makes it return
 * an error (after unregistering what was registered before it); later
 * failures are logged with printk, and an icmp_init() failure panics.
 * Hooked in through fs_initcall().
 */
static int __init inet_init(void)
{
  struct sk_buff *dummy_skb;
  struct inet_protosw *q;
  struct list_head *r;
  int rc = -EINVAL;
 
  /* Compile-time check; dummy_skb is only used inside sizeof. */
  BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
  rc = proto_register(&tcp_prot, 1);
  if (rc)
    goto out;
 
  rc = proto_register(&udp_prot, 1);
  if (rc)
    goto out_unregister_tcp_proto;
 
  rc = proto_register(&raw_prot, 1);
  if (rc)
    goto out_unregister_udp_proto;
 
  /*
   *  Tell SOCKET that we are alive...
   */
 
  /* Return value deliberately discarded by the (void) cast. */
  (void)sock_register(&inet_family_ops);
 
#ifdef CONFIG_SYSCTL
  ip_static_sysctl_init();
#endif
 
  /*
   *  Add all the base protocols.
   */
 
  if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
  if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
  if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
#ifdef CONFIG_IP_MULTICAST
  if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
    printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
#endif
 
  /* Register the socket-side information for inet_create. */
  /* The list heads must be initialised before any entry is registered. */
  for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
    INIT_LIST_HEAD(r);
 
  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
    inet_register_protosw(q);
 
  /*
   *  Set the ARP module up
   */
 
  arp_init();
 
  /*
   *  Set the IP module up
   */
 
  ip_init();
 
  tcp_v4_init();
 
  /* Setup TCP slab cache for open requests. */
  tcp_init();
 
  /* Setup UDP memory threshold */
  udp_init();
 
  /* Add UDP-Lite (RFC 3828) */
  udplite4_register();
 
  /*
   *  Set the ICMP layer up
   */
 
  if (icmp_init() < 0)
    panic("Failed to create the ICMP control socket.\n");
 
  /*
   *  Initialise the multicast router
   */
#if defined(CONFIG_IP_MROUTE)
  if (ip_mr_init())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
  /*
   *  Initialise per-cpu ipv4 mibs
   */
 
  if (init_ipv4_mibs())
    printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
 
  ipv4_proc_init();
 
  ipfrag_init();
 
  dev_add_pack(&ip_packet_type);
 
  rc = 0;
out:
  return rc;
  /* Unwind in reverse order of registration; each label falls through to the next. */
out_unregister_udp_proto:
  proto_unregister(&udp_prot);
out_unregister_tcp_proto:
  proto_unregister(&tcp_prot);
  goto out;
}
 
fs_initcall(inet_init);

从1592行可以看出初始化inetsw是用的inetsw_array数组，再看看inetsw_array数组。

/*
 * proto_ops for PF_INET stream sockets: the SOCK_STREAM/IPPROTO_TCP entry
 * of inetsw_array points here, so inet_create() installs this table in
 * socket->ops for TCP sockets.
 * NOTE(review): the sock_no_* entries presumably reject the operation as
 * unsupported - their definitions are not shown here.
 */
const struct proto_ops inet_stream_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_stream_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = inet_accept,
  .getname     = inet_getname,
  .poll       = tcp_poll,
  .ioctl       = inet_ioctl,
  .listen       = inet_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = tcp_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = tcp_sendpage,
  .splice_read     = tcp_splice_read,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
 
/*
 * proto_ops for PF_INET datagram sockets: referenced by the
 * SOCK_DGRAM/IPPROTO_UDP entry of inetsw_array. Compared with
 * inet_stream_ops, accept and listen point at sock_no_* handlers and no
 * splice_read is provided.
 */
const struct proto_ops inet_dgram_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = udp_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
 
/*
 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
 * udp_poll
 *
 * The only member that differs from inet_dgram_ops is .poll, which is
 * datagram_poll here. Referenced by the SOCK_RAW entry of inetsw_array;
 * file-local (static) and not exported.
 */
static const struct proto_ops inet_sockraw_ops = {
  .family       = PF_INET,
  .owner       = THIS_MODULE,
  .release     = inet_release,
  .bind       = inet_bind,
  .connect     = inet_dgram_connect,
  .socketpair     = sock_no_socketpair,
  .accept       = sock_no_accept,
  .getname     = inet_getname,
  .poll       = datagram_poll,
  .ioctl       = inet_ioctl,
  .listen       = sock_no_listen,
  .shutdown     = inet_shutdown,
  .setsockopt     = sock_common_setsockopt,
  .getsockopt     = sock_common_getsockopt,
  .sendmsg     = inet_sendmsg,
  .recvmsg     = sock_common_recvmsg,
  .mmap       = sock_no_mmap,
  .sendpage     = inet_sendpage,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_sock_common_setsockopt,
  .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
 
/*
 * PF_INET family entry: socket creation for this family goes through
 * inet_create(). inet_init() passes this object to sock_register().
 */
static struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};
 
/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 *
 * inet_init() registers each element with inet_register_protosw().
 * capability -1 means inet_create() performs no capable() check, since
 * it only checks values > 0.
 */
static struct inet_protosw inetsw_array[] =
{
  /* TCP: stream sockets; permanent entry, flagged INET_PROTOSW_ICSK. */
  {
    .type =       SOCK_STREAM,
    .protocol =   IPPROTO_TCP,
    .prot =       &tcp_prot,
    .ops =        &inet_stream_ops,
    .capability = -1,
    .no_check =   0,
    .flags =      INET_PROTOSW_PERMANENT |
            INET_PROTOSW_ICSK,
  },
 
  /* UDP: datagram sockets; permanent entry. */
  {
    .type =       SOCK_DGRAM,
    .protocol =   IPPROTO_UDP,
    .prot =       &udp_prot,
    .ops =        &inet_dgram_ops,
    .capability = -1,
    .no_check =   UDP_CSUM_DEFAULT,
    .flags =      INET_PROTOSW_PERMANENT,
       },
 
 
       /* Raw sockets: wildcard protocol, requires CAP_NET_RAW, not permanent. */
       {
         .type =       SOCK_RAW,
         .protocol =   IPPROTO_IP,  /* wild card */
         .prot =       &raw_prot,
         .ops =        &inet_sockraw_ops,
         .capability = CAP_NET_RAW,
         .no_check =   UDP_CSUM_DEFAULT,
         .flags =      INET_PROTOSW_REUSE,
       }
};
 
/* Number of entries in inetsw_array[]; inet_init() uses it as the bound of its registration loop. */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

假设我们分析ipv4中的TCP协议，其它协议也可以参照分析。现在回到inet_create函数，这个函数最重要的一行就是335，这一行的作用就是初始化套接口socket所应该对应的操作函数。例如如果用socket(AF_INET, SOCK_STREAM, 0);创建套接字，则内核就会在这里为这个套接字关联上相应的TCP的操作函数集inet_stream_ops，以后在这个套接字上的数据的各种操作如accept listen bind send recv都会通过这些函数完成。
接下来在inet_create中的344行之后就是分配一个struct sock结构体，这个sock结构和socket结构是一一对应的，两个结构各有一个成员指向对方。struct sock是在include/net/sock.h中定义，它有两个非常重要的成员sk_receive_queue和sk_write_queue。还有两个成员sk_rcvbuf、sk_sndbuf分别代表接收和发送缓冲区的大小，它们是在sock_init_data(net/core/sock.c)中初始化的，默认值分别取自sysctl_rmem_default和sysctl_wmem_default（原文称默认是32767字节，具体数值随内核版本而定）。另外对于有连接模式可能要求超时重传，所以还有一个sk_timer的定时队列。

/**
  *  struct sock - network layer representation of sockets
  *  @__sk_common: shared layout with inet_timewait_sock
  *  @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *  @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *  @sk_lock:  synchronizer
  *  @sk_rcvbuf: size of receive buffer in bytes
  *  @sk_sleep: sock wait queue
  *  @sk_dst_cache: destination cache
  *  @sk_dst_lock: destination cache lock
  *  @sk_policy: flow policy
  *  @sk_rmem_alloc: receive queue bytes committed
  *  @sk_receive_queue: incoming packets
  *  @sk_wmem_alloc: transmit queue bytes committed
  *  @sk_write_queue: Packet sending queue
  *  @sk_async_wait_queue: DMA copied packets
  *  @sk_omem_alloc: "o" is "option" or "other"
  *  @sk_wmem_queued: persistent queue size
  *  @sk_forward_alloc: space allocated forward
  *  @sk_allocation: allocation mode
  *  @sk_sndbuf: size of send buffer in bytes
  *  @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *       %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *  @sk_no_check: %SO_NO_CHECK setting, whether or not to checksum packets
  *  @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *  @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *  @sk_gso_max_size: Maximum GSO segment size to build
  *  @sk_lingertime: %SO_LINGER l_linger setting
  *  @sk_backlog: always used with the per-socket spinlock held
  *  @sk_callback_lock: used with the callbacks in the end of this struct
  *  @sk_error_queue: rarely used
  *  @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *        IPV6_ADDRFORM for instance)
  *  @sk_err: last error
  *  @sk_err_soft: errors that don't cause failure but are the cause of a
  *          persistent failure not just 'timed out'
  *  @sk_drops: raw/udp drops counter
  *  @sk_ack_backlog: current listen backlog
  *  @sk_max_ack_backlog: listen backlog set in listen()
  *  @sk_priority: %SO_PRIORITY setting
  *  @sk_type: socket type (%SOCK_STREAM, etc)
  *  @sk_protocol: which protocol this socket belongs in this network family
  *  @sk_peercred: %SO_PEERCRED setting
  *  @sk_rcvlowat: %SO_RCVLOWAT setting
  *  @sk_rcvtimeo: %SO_RCVTIMEO setting
  *  @sk_sndtimeo: %SO_SNDTIMEO setting
  *  @sk_filter: socket filtering instructions
  *  @sk_protinfo: private area, net family specific, when not using slab
  *  @sk_timer: sock cleanup timer
  *  @sk_stamp: time stamp of last packet received
  *  @sk_socket: Identd and reporting IO signals
  *  @sk_user_data: RPC layer private data
  *  @sk_sndmsg_page: cached page for sendmsg
  *  @sk_sndmsg_off: cached offset for sendmsg
  *  @sk_send_head: front of stuff to transmit
  *  @sk_security: used by security modules
  *  @sk_mark: generic packet mark
  *  @sk_write_pending: a write to stream socket waits to start
  *  @sk_state_change: callback to indicate change in the state of the sock
  *  @sk_data_ready: callback to indicate there is data to be processed
  *  @sk_write_space: callback to indicate there is bf sending space available
  *  @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *  @sk_backlog_rcv: callback to process the backlog
  *  @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 */
/*
 * Network-layer representation of a socket. inet_create() allocates one
 * with sk_alloc() for every struct socket it sets up. The sk_* macros
 * defined inside the struct are shorthands for members of __sk_common.
 */
struct sock {
  /*
   * Now struct inet_timewait_sock also uses sock_common, so please just
   * don't add nothing before this first member (__sk_common) --acme
   */
  struct sock_common  __sk_common;
  /* Shorthand names: each sk_xxx expands to the matching __sk_common member. */
#define sk_node      __sk_common.skc_node
#define sk_nulls_node    __sk_common.skc_nulls_node
#define sk_refcnt    __sk_common.skc_refcnt
 
  /* Note: sk_copy_start and sk_hash both expand to skc_hash. */
#define sk_copy_start    __sk_common.skc_hash
#define sk_hash      __sk_common.skc_hash
#define sk_family    __sk_common.skc_family
#define sk_state    __sk_common.skc_state
#define sk_reuse    __sk_common.skc_reuse
#define sk_bound_dev_if    __sk_common.skc_bound_dev_if
#define sk_bind_node    __sk_common.skc_bind_node
#define sk_prot      __sk_common.skc_prot
#define sk_net      __sk_common.skc_net
  /* Five bitfields packed into one unsigned int: 2 + 2 + 4 + 8 + 16 = 32 bits. */
  kmemcheck_bitfield_begin(flags);
  unsigned int    sk_shutdown  : 2,
        sk_no_check  : 2,
        sk_userlocks : 4,
        sk_protocol  : 8,
        sk_type      : 16;
  kmemcheck_bitfield_end(flags);
  int      sk_rcvbuf;
  socket_lock_t    sk_lock;
  /*
   * The backlog queue is special, it is always used with
   * the per-socket spinlock held and requires low latency
   * access. Therefore we special case its implementation.
   */
  struct {
    struct sk_buff *head;
    struct sk_buff *tail;
  } sk_backlog;
  wait_queue_head_t  *sk_sleep;
  struct dst_entry  *sk_dst_cache;
#ifdef CONFIG_XFRM
  struct xfrm_policy  *sk_policy[2];
#endif
  rwlock_t    sk_dst_lock;
  atomic_t    sk_rmem_alloc;
  atomic_t    sk_wmem_alloc;
  atomic_t    sk_omem_alloc;
  int      sk_sndbuf;
  /* Incoming and outgoing packet queues. */
  struct sk_buff_head  sk_receive_queue;
  struct sk_buff_head  sk_write_queue;
#ifdef CONFIG_NET_DMA
  struct sk_buff_head  sk_async_wait_queue;
#endif
  int      sk_wmem_queued;
  int      sk_forward_alloc;
  gfp_t      sk_allocation;
  int      sk_route_caps;
  int      sk_gso_type;
  unsigned int    sk_gso_max_size;
  int      sk_rcvlowat;
  unsigned long     sk_flags;
  unsigned long          sk_lingertime;
  struct sk_buff_head  sk_error_queue;
  struct proto    *sk_prot_creator;
  rwlock_t    sk_callback_lock;
  int      sk_err,
        sk_err_soft;
  atomic_t    sk_drops;
  unsigned short    sk_ack_backlog;
  unsigned short    sk_max_ack_backlog;
  __u32      sk_priority;
  struct ucred    sk_peercred;
  long      sk_rcvtimeo;
  long      sk_sndtimeo;
  struct sk_filter        *sk_filter;
  void      *sk_protinfo;
  struct timer_list  sk_timer;
  ktime_t      sk_stamp;
  /* Pointer to the struct socket; struct socket has an sk member of type struct sock *. */
  struct socket    *sk_socket;
  void      *sk_user_data;
  struct page    *sk_sndmsg_page;
  struct sk_buff    *sk_send_head;
  __u32      sk_sndmsg_off;
  int      sk_write_pending;
#ifdef CONFIG_SECURITY
  void      *sk_security;
#endif
  __u32      sk_mark;
  /* XXX 4 bytes hole on 64 bit */
  /* Callbacks; inet_create() sets sk_backlog_rcv and sk_destruct itself. */
  void      (*sk_state_change)(struct sock *sk);
  void      (*sk_data_ready)(struct sock *sk, int bytes);
  void      (*sk_write_space)(struct sock *sk);
  void      (*sk_error_report)(struct sock *sk);
    int      (*sk_backlog_rcv)(struct sock *sk,
              struct sk_buff *skb);
  void                    (*sk_destruct)(struct sock *sk);
};

在分析sk_alloc之前先分析一下answer_prot. answer_prot是struct proto类型(include/net/sock.h)

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 * transport -> network interface is defined by struct inet_proto
 *
 * A struct sock reaches its instance through sk->sk_prot; tcp_prot is the
 * instance referenced by the SOCK_STREAM entry of inetsw_array. During
 * socket creation inet_create() uses ->slab (WARN_ON if NULL),
 * ->backlog_rcv, ->hash and ->init.
 */
struct proto {
  void      (*close)(struct sock *sk,
          long timeout);
  int      (*connect)(struct sock *sk,
                struct sockaddr *uaddr,
          int addr_len);
  int      (*disconnect)(struct sock *sk, int flags);
 
  struct sock *    (*accept) (struct sock *sk, int flags, int *err);
 
  int      (*ioctl)(struct sock *sk, int cmd,
           unsigned long arg);
  /* Optional: inet_create() calls it only when it is non-NULL. */
  int      (*init)(struct sock *sk);
  void      (*destroy)(struct sock *sk);
  void      (*shutdown)(struct sock *sk, int how);
  int      (*setsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*getsockopt)(struct sock *sk, int level,
          int optname, char __user *optval,
          int __user *option);
#ifdef CONFIG_COMPAT
  int      (*compat_setsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          unsigned int optlen);
  int      (*compat_getsockopt)(struct sock *sk,
          int level,
          int optname, char __user *optval,
          int __user *option);
#endif
  int      (*sendmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg, size_t len);
  int      (*recvmsg)(struct kiocb *iocb, struct sock *sk,
             struct msghdr *msg,
          size_t len, int noblock, int flags,
          int *addr_len);
  int      (*sendpage)(struct sock *sk, struct page *page,
          int offset, size_t size, int flags);
  int      (*bind)(struct sock *sk,
          struct sockaddr *uaddr, int addr_len);
 
  /* Copied into sk->sk_backlog_rcv by inet_create(). */
  int      (*backlog_rcv) (struct sock *sk,
            struct sk_buff *skb);
 
  /* Keeping track of sk's, looking them up, and port selection methods. */
  void      (*hash)(struct sock *sk);
  void      (*unhash)(struct sock *sk);
  int      (*get_port)(struct sock *sk, unsigned short snum);
 
  /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
  unsigned int    inuse_idx;
#endif
 
  /* Memory pressure */
  void      (*enter_memory_pressure)(struct sock *sk);
  atomic_t    *memory_allocated;  /* Current allocated memory. */
  struct percpu_counter  *sockets_allocated;  /* Current number of sockets. */
  /*
   * Pressure flag: try to collapse.
   * Technical note: it is used by multiple contexts non atomically.
   * All the __sk_mem_schedule() is of this nature: accounting
   * is strict, actions are advisory and have some latency.
   */
  int      *memory_pressure;
  int      *sysctl_mem;
  int      *sysctl_wmem;
  int      *sysctl_rmem;
  int      max_header;
 
  /* NOTE(review): presumably the cache sk_alloc() allocates socks from, sized by obj_size - confirm against sk_alloc(). */
  struct kmem_cache  *slab;
  unsigned int    obj_size;
  int      slab_flags;
 
  struct percpu_counter  *orphan_count;
 
  struct request_sock_ops  *rsk_prot;
  struct timewait_sock_ops *twsk_prot;
 
  /* Protocol-specific hash table; only one member is meaningful per protocol. */
  union {
    struct inet_hashinfo  *hashinfo;
    struct udp_table  *udp_table;
    struct raw_hashinfo  *raw_hash;
  } h;
 
  struct module    *owner;
 
  char      name[32];
 
  struct list_head  node;
#ifdef SOCK_REFCNT_DEBUG
  atomic_t    socks;
#endif
};

假设分析的是TCP协议，则通过336行的赋值，从inetsw_array中找到的prot成员变量即为tcp_prot(net/ipv4/tcp_ipv4.c)。

struct proto tcp_prot = {
  .name      = "TCP",
  .owner      = THIS_MODULE,
  .close      = tcp_close,
  .connect    = tcp_v4_connect,
  .disconnect    = tcp_disconnect,
  .accept      = inet_csk_accept,
  .ioctl      = tcp_ioctl,
  .init      = tcp_v4_init_sock,
  .destroy    = tcp_v4_destroy_sock,
  .shutdown    = tcp_shutdown,
  .setsockopt    = tcp_setsockopt,
  .getsockopt    = tcp_getsockopt,
  .recvmsg    = tcp_recvmsg,
  .backlog_rcv    = tcp_v4_do_rcv,
  .hash      = inet_hash,
  .unhash      = inet_unhash,
  .get_port    = inet_csk_get_port,
  .enter_memory_pressure  = tcp_enter_memory_pressure,
  .sockets_allocated  = &tcp_sockets_allocated,
  .orphan_count    = &tcp_orphan_count,
  .memory_allocated  = &tcp_memory_allocated,
  .memory_pressure  = &tcp_memory_pressure,
  .sysctl_mem    = sysctl_tcp_mem,
  .sysctl_wmem    = sysctl_tcp_wmem,
  .sysctl_rmem    = sysctl_tcp_rmem,
  .max_header    = MAX_TCP_HEADER,
  .obj_size    = sizeof(struct tcp_sock),
  .slab_flags    = SLAB_DESTROY_BY_RCU,
  .twsk_prot    = &tcp_timewait_sock_ops,
  .rsk_prot    = &tcp_request_sock_ops,
  .h.hashinfo    = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
  .compat_setsockopt  = compat_tcp_setsockopt,
  .compat_getsockopt  = compat_tcp_getsockopt,
#endif
};

从tcp_prot结构体各成员的赋值可以发现，slab成员并没有被静态初始化，而obj_size被初始化为sizeof(struct tcp_sock)，这一点在后面的分析中会用到。接下来看inet_create的344行，即sk_alloc(net/core/sock.c)。

/*
 * Allocate the memory for one sock of protocol @prot.
 *
 * prot->obj_size bytes are taken from prot->slab when the protocol has a
 * cache and from kmalloc() otherwise. After a successful allocation the
 * security hook is run and a reference on prot->owner is taken; if either
 * step fails the memory is released again and NULL is returned.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
    int family)
{
  struct kmem_cache *cache = prot->slab;
  struct sock *sk;

  if (cache == NULL) {
    sk = kmalloc(prot->obj_size, priority);
  } else {
    sk = kmem_cache_alloc(cache, priority & ~__GFP_ZERO);
    if (sk != NULL && (priority & __GFP_ZERO)) {
      const size_t before_next = offsetof(struct sock, sk_node.next);
      const size_t from_pprev = offsetof(struct sock, sk_node.pprev);

      /*
       * Caches using SLAB_DESTROY_BY_RCU should leave
       * sk_node.next unmodified, so the zeroing is done by
       * hand in two pieces that skip that one field.
       */
      if (before_next != 0)
        memset(sk, 0, before_next);
      memset(&sk->sk_node.pprev, 0, prot->obj_size - from_pprev);
    }
  }

  if (sk == NULL)
    return NULL;

  kmemcheck_annotate_bitfield(sk, flags);

  if (security_sk_alloc(sk, family, priority))
    goto release_mem;

  if (!try_module_get(prot->owner))
    goto release_sec;

  return sk;

release_sec:
  security_sk_free(sk);
release_mem:
  /* Give the memory back to whichever allocator it came from. */
  if (cache == NULL)
    kfree(sk);
  else
    kmem_cache_free(cache, sk);
  return NULL;
}

/**
 *  sk_alloc - allocate and minimally initialise a sock
 *  @net: the applicable net namespace
 *  @family: protocol family
 *  @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *  @prot: struct proto associated with this new sock instance
 *
 *  The object is requested zeroed (__GFP_ZERO is added to @priority).
 *  Returns the new sock, or NULL when sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
          struct proto *prot)
{
  struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

  if (!sk)
    return NULL;

  sk->sk_family = family;
  /*
   * sk_prot_creator is recorded alongside sk_prot; the comment in
   * the struct sock definition explains why both exist. -acme
   */
  sk->sk_prot_creator = prot;
  sk->sk_prot = prot;
  sock_lock_init(sk);
  sock_net_set(sk, get_net(net));
  atomic_set(&sk->sk_wmem_alloc, 1);

  return sk;
}
EXPORT_SYMBOL(sk_alloc);

很明显，sk_alloc直接调用sk_prot_alloc来分配sock结构。sk_prot_alloc先判定slab是否为空(如前提示)：按照上文的分析，tcp_prot的静态初始化中并未给slab赋值，于是直接用kmalloc分配obj_size大小即sizeof(struct tcp_sock)的空间（注：实际运行时，inet_init会通过proto_register(&tcp_prot, 1)为tcp_prot创建slab缓存，因此通常走的是kmem_cache_alloc分支，但分配的对象大小同样是obj_size）。函数返回的地址类型为struct sock *，而这块空间的大小却是sizeof(struct tcp_sock)，这就说明有两种可能：一、sizeof(struct tcp_sock) == sizeof(struct sock)；二、sizeof(struct tcp_sock) > sizeof(struct sock)。实际是第二种情况，列出下面一系列数据结构后可以很明显地看出。
先来看struct tcp_sock结构的定义(include/linux/tcp.h)

/*
 * TCP-specific socket state. The first member is the embedded
 * struct inet_connection_sock; tcp_prot sets obj_size to
 * sizeof(struct tcp_sock), so this is the size sk_prot_alloc() allocates
 * for a TCP socket.
 */
struct tcp_sock {
  /* inet_connection_sock has to be the first member of tcp_sock */
  struct inet_connection_sock  inet_conn;
  u16  tcp_header_len;  /* Bytes of tcp header to send    */
  u16  xmit_size_goal_segs; /* Goal for segmenting output packets */
 
/*
 *  Header prediction flags
 *  0x5?10 << 16 + snd_wnd in net byte order
 */
  __be32  pred_flags;
 
/*
 *  RFC793 variables by their proper names. This means you can
 *  read the code and the spec side by side (and laugh ...)
 *  See RFC793 and RFC1122. The RFC writes these in capitals.
 */
   u32  rcv_nxt;  /* What we want to receive next   */
  u32  copied_seq;  /* Head of yet unread data    */
  u32  rcv_wup;  /* rcv_nxt on last window update sent  */
   u32  snd_nxt;  /* Next sequence we send    */
 
   u32  snd_una;  /* First byte we want an ack for  */
   u32  snd_sml;  /* Last byte of the most recently transmitted small packet */
  u32  rcv_tstamp;  /* timestamp of last received ACK (for keepalives) */
  u32  lsndtime;  /* timestamp of last sent data packet (for restart window) */
 
  /* Data for direct copy to user */
  struct {
    struct sk_buff_head  prequeue;
    struct task_struct  *task;
    struct iovec    *iov;
    int      memory;
    int      len;
#ifdef CONFIG_NET_DMA
    /* members for async copy */
    struct dma_chan    *dma_chan;
    int      wakeup;
    struct dma_pinned_list  *pinned_list;
    dma_cookie_t    dma_cookie;
#endif
  } ucopy;
 
  u32  snd_wl1;  /* Sequence for window update    */
  u32  snd_wnd;  /* The window we expect to receive  */
  u32  max_window;  /* Maximal window ever seen from peer  */
  u32  mss_cache;  /* Cached effective mss, not including SACKS */
 
  u32  window_clamp;  /* Maximal window to advertise    */
  u32  rcv_ssthresh;  /* Current window clamp      */
 
  u32  frto_highmark;  /* snd_nxt when RTO occurred */
  u16  advmss;    /* Advertised MSS      */
  u8  frto_counter;  /* Number of new acks after RTO */
  u8  nonagle;  /* Disable Nagle algorithm?             */
 
/* RTT measurement */
  u32  srtt;    /* smoothed round trip time << 3  */
  u32  mdev;    /* medium deviation      */
  u32  mdev_max;  /* maximal mdev for the last rtt period  */
  u32  rttvar;    /* smoothed mdev_max      */
  u32  rtt_seq;  /* sequence number to update rttvar  */
 
  u32  packets_out;  /* Packets which are "in flight"  */
  u32  retrans_out;  /* Retransmitted packets out    */
 
  u16  urg_data;  /* Saved octet of OOB data and control flags */
  u8  ecn_flags;  /* ECN status bits.      */
  u8  reordering;  /* Packet reordering metric.    */
  u32  snd_up;    /* Urgent pointer    */
 
  u8  keepalive_probes; /* num of allowed keep alive probes  */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
  struct tcp_options_received rx_opt;
 
/*
 *  Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
   u32  snd_ssthresh;  /* Slow start size threshold    */
   u32  snd_cwnd;  /* Sending congestion window    */
  u32  snd_cwnd_cnt;  /* Linear increase counter    */
  u32  snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
  /* NOTE(review): the next two fields carry no description here — confirm their meaning against the TCP code. */
  u32  snd_cwnd_used;
  u32  snd_cwnd_stamp;
 
   u32  rcv_wnd;  /* Current receiver window    */
  u32  write_seq;  /* Tail(+1) of data held in tcp send buffer */
  u32  pushed_seq;  /* Last pushed seq, required to talk to windows */
  u32  lost_out;  /* Lost packets      */
  u32  sacked_out;  /* SACK'd packets      */
  u32  fackets_out;  /* FACK'd packets      */
  u32  tso_deferred;  /* NOTE(review): undocumented here — confirm */
  u32  bytes_acked;  /* Appropriate Byte Counting - RFC3465 */
 
  /* from STCP, retrans queue hinting */
  struct sk_buff* lost_skb_hint;
  struct sk_buff *scoreboard_skb_hint;
  struct sk_buff *retransmit_skb_hint;
 
  struct sk_buff_head  out_of_order_queue; /* Out of order segments go here */
 
  /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */
  struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
  struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
  struct tcp_sack_block recv_sack_cache[4];
 
  struct sk_buff *highest_sack;   /* highest skb with SACK received
           * (validity guaranteed only if
           * sacked_out > 0)
           */
 
  int     lost_cnt_hint;
  u32     retransmit_high;  /* L-bits may be on up to this seqno */
 
  u32  lost_retrans_low;  /* Sent seq after any rxmit (lowest) */
 
  u32  prior_ssthresh; /* ssthresh saved at recovery start  */
  u32  high_seq;  /* snd_nxt at onset of congestion  */
 
  u32  retrans_stamp;  /* Timestamp of the last retransmit,
         * also used in SYN-SENT to remember stamp of
         * the first SYN. */
  u32  undo_marker;  /* tracking retrans started here. */
  int  undo_retrans;  /* number of undoable retransmissions. */
  u32  total_retrans;  /* Total retransmits for entire connection */
 
  u32  urg_seq;  /* Seq of received urgent pointer */
  unsigned int    keepalive_time;    /* time before keep alive takes place */
  unsigned int    keepalive_intvl;  /* time interval between keep alive probes */
 
  int      linger2;  /* NOTE(review): undocumented here — confirm */
 
/* Receiver side RTT estimation */
  struct {
    u32  rtt;
    u32  seq;
    u32  time;
  } rcv_rtt_est;
 
/* Receiver queue space */
  struct {
    int  space;
    u32  seq;
    u32  time;
  } rcvq_space;
 
/* TCP-specific MTU probe information. */
  struct {
    u32      probe_seq_start;
    u32      probe_seq_end;
  } mtu_probe;
 
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
  const struct tcp_sock_af_ops  *af_specific;
 
/* TCP MD5 Signature Option information */
  struct tcp_md5sig_info  *md5sig_info;
#endif
};

tcp_sock结构体的第一个成员变量的类型为struct inet_connection_sock(include/net/inet_connection_sock.h)

/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:     FIFO of established children
 * @icsk_bind_hash:     Bind node
 * @icsk_timeout:     Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:       Retransmit timeout
 * @icsk_pmtu_cookie:     Last pmtu seen by socket
 * @icsk_ca_ops:     Pluggable congestion control hook
 * @icsk_af_ops:     Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:     Congestion control state
 * @icsk_retransmits:     Number of unrecovered [RTO] timeouts
 * @icsk_pending:     Scheduled timer event
 * @icsk_backoff:     Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:     unanswered 0 window probes
 * @icsk_ext_hdr_len:     Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:       Delayed ACK control data
 * @icsk_mtup:       MTU probing control data
 *
 * NOTE(review): icsk_inet, icsk_delack_timer, icsk_sync_mss and
 * icsk_ca_priv are not described in the list above. Judging by its name,
 * icsk_delack_timer is presumably the timer behind @icsk_ack — confirm.
 */
struct inet_connection_sock {
  /* inet_sock has to be the first member! */
  struct inet_sock    icsk_inet;
  struct request_sock_queue icsk_accept_queue;
  struct inet_bind_bucket    *icsk_bind_hash;
  unsigned long      icsk_timeout;
   struct timer_list    icsk_retransmit_timer;
   struct timer_list    icsk_delack_timer;
  __u32        icsk_rto;
  __u32        icsk_pmtu_cookie;
  const struct tcp_congestion_ops *icsk_ca_ops;
  const struct inet_connection_sock_af_ops *icsk_af_ops;
  unsigned int      (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
  __u8        icsk_ca_state;
  __u8        icsk_retransmits;
  __u8        icsk_pending;
  __u8        icsk_backoff;
  __u8        icsk_syn_retries;
  __u8        icsk_probes_out;
  __u16        icsk_ext_hdr_len;
  struct {
    __u8      pending;   /* ACK is pending         */
    __u8      quick;   /* Scheduled number of quick acks     */
    __u8      pingpong;   /* The session is interactive       */
    __u8      blocked;   /* Delayed ACK was blocked by socket lock */
    __u32      ato;     /* Predicted tick of soft clock     */
    unsigned long    timeout;   /* Currently scheduled timeout       */
    __u32      lrcvtime;   /* timestamp of last received data packet */
    __u16      last_seg_size; /* Size of last incoming segment     */
    __u16      rcv_mss;   /* MSS used for delayed ACK decisions     */
  } icsk_ack;
  struct {
    int      enabled;
 
    /* Range of MTUs to search */
    int      search_high;
    int      search_low;
 
    /* Information on the current probe. */
    int      probe_size;
  } icsk_mtup;
  u32        icsk_ca_priv[16];
/* Size in bytes of icsk_ca_priv; the 16 matches the array length above. */
#define ICSK_CA_PRIV_SIZE  (16 * sizeof(u32))
};

在 inet_connection_sock结构体中第一个成员变量类型为struct inet_sock(include/net/inet_sock.h)

/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @daddr - Foreign IPv4 addr
 * @rcv_saddr - Bound local IPv4 addr
 * @dport - Destination port
 * @num - Local port
 * @saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @sport - Source port
 * @id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @is_icsk - is this an inet_connection_sock?
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 *
 * NOTE(review): cmsg_flags, opt, pmtudisc, mc_addr and the remaining
 * one-bit flags (recverr, freebind, hdrincl, mc_loop, transparent,
 * mc_all) are not described above — confirm their meaning before
 * relying on them.
 */
struct inet_sock {
  /* sk and pinet6 have to be the first two members of inet_sock */
  struct sock    sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  struct ipv6_pinfo  *pinet6;
#endif
  /* Socket demultiplex comparisons on incoming packets. */
  __be32      daddr;
  __be32      rcv_saddr;
  __be16      dport;
  __u16      num;
  __be32      saddr;
  __s16      uc_ttl;
  __u16      cmsg_flags;
  struct ip_options  *opt;
  __be16      sport;
  __u16      id;
  __u8      tos;
  __u8      mc_ttl;
  __u8      pmtudisc;
  /* Seven one-bit flags packed into a single __u8. */
  __u8      recverr:1,
        is_icsk:1,
        freebind:1,
        hdrincl:1,
        mc_loop:1,
        transparent:1,
        mc_all:1;
  int      mc_index;
  __be32      mc_addr;
  struct ip_mc_socklist  *mc_list;
  struct {
    unsigned int    flags;
    unsigned int    fragsize;
    struct ip_options  *opt;
    struct dst_entry  *dst;
    int      length; /* Total length of all frames */
    __be32      addr;
    struct flowi    fl;
  } cork;
};

而inet_sock的第一个成员正是struct sock类型，所以sk_prot_alloc直接返回struct sock *类型指针是没有问题的，接下来执行inet_create中的353行用inet_sk通过sk获得inet指针的值，inet_sk函数其实就相当于强制类型转换，返回的就是sk的指针。
接下来程序就一路返回到__sock_create，接着再返回到sys_socket中。在sys_socket中调用了最后一个函数sock_map_fd(net/socket.c)，将socket指针sock与一个已经打开的文件号关联起来返回给用户程序。

/*
 *  Obtains the first available file descriptor and sets it up for use.
 *
 *  These functions create file structures and maps them to fd space
 *  of the current process. On success it returns file descriptor
 *  and file struct implicitly stored in sock->file.
 *  Note that another thread may close file descriptor before we return
 *  from this function. We use the fact that now we do not refer
 *  to socket after mapping. If one day we will need it, this
 *  function will increment ref. count on file by 1.
 *
 *  In any case returned fd MAY BE not valid!
 *  This race condition is unavoidable
 *  with shared fd spaces, we cannot solve it inside kernel,
 *  but we take care of internal coherence yet.
 */
 
/*
 * Reserve an unused descriptor and an empty struct file.
 *
 * Returns the descriptor on success, with the file stored in *filep.
 * On failure *filep is NULL and a negative value is returned: whatever
 * get_unused_fd_flags() reported, or -ENFILE when no file could be
 * obtained (the descriptor is released again in that case).
 */
static int sock_alloc_fd(struct file **filep, int flags)
{
  struct file *filp;
  int fd = get_unused_fd_flags(flags);

  if (unlikely(fd < 0)) {
    *filep = NULL;
    return fd;
  }

  filp = get_empty_filp();
  *filep = filp;
  if (likely(filp))
    return fd;

  put_unused_fd(fd);
  return -ENFILE;
}
 
static int sock_attach_fd(struct socket *sock, struct file *file, int flags)
{
  struct dentry *dentry;
  struct qstr name = { .name = "" };
 
  dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
  if (unlikely(!dentry))
    return -ENOMEM;
 
  dentry->d_op = &sockfs_dentry_operations;
  /*
   * We dont want to push this dentry into global dentry hash table.
   * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
   * This permits a working /proc/$pid/fd/XXX on sockets
   */
  dentry->d_flags &= ~DCACHE_UNHASHED;
  d_instantiate(dentry, SOCK_INODE(sock));
 
  sock->file = file;
  init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
      &socket_file_ops);
  SOCK_INODE(sock)->i_fop = &socket_file_ops;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);
  file->f_pos = 0;
  file->private_data = sock;
 
  return 0;
}
 
/*
 * Give @sock a file descriptor in the current process.
 *
 * Returns the descriptor once it has been installed, or a negative error
 * from sock_alloc_fd() / sock_attach_fd(). When attaching fails, both
 * the file and the reserved descriptor are released before returning.
 */
int sock_map_fd(struct socket *sock, int flags)
{
  struct file *filp;
  int err;
  int fd = sock_alloc_fd(&filp, flags);

  if (unlikely(fd < 0))
    return fd;

  err = sock_attach_fd(sock, filp, flags);
  if (likely(err >= 0)) {
    fd_install(fd, filp);
    return fd;
  }

  put_filp(filp);
  put_unused_fd(fd);
  return err;
}

fs/dcache.c

/* the caller must hold dcache_lock */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
  if (inode)
    list_add(&dentry->d_alias, &inode->i_dentry);
  dentry->d_inode = inode;
  fsnotify_d_instantiate(dentry, inode);
}
 
/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * The dentry must not already be on an alias list; this is enforced
 * with BUG_ON().
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
  /* A dentry that is still linked on d_alias must not be instantiated again. */
  BUG_ON(!list_empty(&entry->d_alias));
  /* __d_instantiate() requires dcache_lock to be held by its caller. */
  spin_lock(&dcache_lock);
  __d_instantiate(entry, inode);
  spin_unlock(&dcache_lock);
  /* The security hook runs after dcache_lock has been dropped. */
  security_d_instantiate(entry, inode);
}

/net/socket.c

/*
 *  File operations installed on every socket file.
 *
 *  Besides these generic file operations, socket files also have a set of
 *  'special' operations. Those do not appear in the operation structures;
 *  they are done directly via the socketcall() multiplexor.
 */

static const struct file_operations socket_file_ops = {
  .owner          = THIS_MODULE,

  /* open/close; opening via /proc is deliberately disallowed */
  .open           = sock_no_open,
  .release        = sock_close,

  /* data transfer */
  .aio_read       = sock_aio_read,
  .aio_write      = sock_aio_write,
  .sendpage       = sock_sendpage,
  .splice_read    = sock_splice_read,
  .splice_write   = generic_splice_sendpage,

  /* positioning and mapping */
  .llseek         = no_llseek,
  .mmap           = sock_mmap,

  /* readiness, async notification and control */
  .poll           = sock_poll,
  .fasync         = sock_fasync,
  .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
  .compat_ioctl   = compat_sock_ioctl,
#endif
};

在sock_map_fd中先通过402行获得一个未用的已经打开的文件号以及file结构，然后通过405行调用sock_attach_fd将文件号与sock相关联起来，在sock_attach_fd中先通过375行从sockfs中分配一个dentry，其中sock_mnt就是在描述sockfs中提到的，d_instantiate的作用就是将dentry与socket的inode关联起来，然后388行又将sock->file与file关联起来。389～390行将socket文件上的操作初始化为socket_file_ops。这样，通过send/recv进入内核将调用inet_stream_ops中的函数，而通过read/write调用将调用socket_file_ops中的函数。然后返回至sys_socket函数中，再经过系统调用切换到用户态，socket函数的整个调用过程完成。

转自http://acm.hrbeu.edu.cn/~puppy/2011/02/28/linux-%E5%8D%8F%E8%AE%AE%E6%A0%88%E5%88%86%E6%9E%90-socket/

Linux

更多推荐