linux内核网络协议栈学习笔记(7)
本篇继续讨论IP包的收发ip_local_deliver:ip_local_deliver用来把数据包接收到本地,代码很短/* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb){ /* * R
本篇继续讨论IP包的收发
ip_local_deliver:
ip_local_deliver用来把数据包接收到本地,代码很短
/*
* Deliver IP Packets to the higher protocol layers.
*
* A packet whose frag_off has any of the IP_MF / IP_OFFSET bits set is first
* passed to ip_defrag(); when that call returns non-zero this function returns
* 0 and goes no further. Otherwise the packet is submitted to the
* NF_INET_LOCAL_IN netfilter hook with ip_local_deliver_finish as the
* continuation, and the value produced by NF_HOOK is returned.
*/
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
/*
* NOTE(review): presumably a non-zero return means the datagram is not
* (yet) complete and the skb is no longer ours -- confirm against
* ip_defrag().
*/
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
/* skb->dev and NULL are the two device arguments handed to the hook. */
return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
如果skb有分段需要重组,会调用到ip_defrag,IP的分段/重组留着后面去研究,现在假设分段已经重组完毕我们得到了一个大的skb报文,经过netfilter过滤之后,可能会进到ip_local_deliver_finish
ip_local_deliver_finish首先调用__skb_pull去掉IP头,如果有RAW socket,那么复制一份skb交给RAW socket处理,然后根据IP头里的protocol字段(ip_hdr(skb)->protocol,而不是skb->protocol)查找全局的net_protocol数组inet_protos,命中对应的协议之后调用net_protocol->handler,例如是TCP的话就调用tcp_v4_rcv
ip_forward:
if (skb->pkt_type != PACKET_HOST)
goto drop;
skb_forward_csum(skb);
如果报文的链路层目的地址不是本机(skb->pkt_type不是PACKET_HOST),drop掉;然后调用skb_forward_csum,当skb->ip_summed为CHECKSUM_COMPLETE时把它改成CHECKSUM_NONE,因为转发需要修改ip头(例如TTL),硬件之前算好的校验值不能再直接使用,需要重新计算校验和
/* We are about to mangle packet. Copy it! */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
goto drop;
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
先调用skb_cow保证skb可写(必要时会拷贝一份),并在头部为出口设备的链路层头预留空间(LL_RESERVED_SPACE加上dst的header_len),然后递减TTL
return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
ip_forward_finish);
调用ip_forward_finish完成转发,ip_forward_finish会调用dst_output把报文发送出去,上一篇我们已经提到了,__mkroute_input会对rtable赋值,如果是转发,那么rtable->u.dst.output = ip_output,这样dst_output实际上就调用了ip_output。整个代码段如下:
/*
* Last step of the forwarding path; passed as the continuation to the
* NF_INET_FORWARD hook.
*
* Bumps the IPSTATS_MIB_OUTFORWDATAGRAMS counter for the namespace of the
* route's device, calls ip_forward_options() when the packet carries IP
* options (optlen != 0), and returns the result of dst_output().
*/
static int ip_forward_finish(struct sk_buff *skb)
{
/* IP options stored in the skb control block. */
struct ip_options * opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
/* Options are the uncommon case, hence unlikely(). */
if (unlikely(opt->optlen))
ip_forward_options(skb);
return dst_output(skb);
}
ip_output:
/*
* Output entry point for an IP packet that already has a route attached.
*
* Accounts skb->len under IPSTATS_MIB_OUT, points skb->dev at the route's
* device, marks the packet as ETH_P_IP and runs the NF_INET_POST_ROUTING hook
* with ip_finish_output as the continuation. Returns whatever NF_HOOK_COND
* returns.
*/
int ip_output(struct sk_buff *skb)
{
/* Outgoing device comes from the dst entry attached to the skb. */
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/*
* The last argument is the hook condition: true unless IPSKB_REROUTED is
* set in the control block.
* NOTE(review): presumably the hook is skipped and ip_finish_output is
* called directly when the condition is false -- confirm against
* NF_HOOK_COND.
*/
return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
可以看出ip_output核心就是调用ip_finish_output
ip_finish_output:
/*
* Continuation of the POST_ROUTING hook: decide whether the packet must be
* fragmented before it is handed to ip_finish_output2().
*
* With CONFIG_NETFILTER and CONFIG_XFRM both enabled, a packet whose dst has
* an xfrm attached is flagged IPSKB_REROUTED and sent through dst_output()
* again instead of being transmitted here.
*
* Returns the result of dst_output(), ip_fragment() or ip_finish_output2().
*/
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
/* Longer than the dst MTU and not a GSO skb: fragment in software. */
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
}
这里如果skb->len大于mtu,那么调用ip_fragment,该函数主要是做分段之后再次调用ip_finish_output2发送出去,后面会有讲解;如果长度合适那么直接调用ip_finish_output2
ip_finish_output2:
/*
* Hand a finished IP packet to the neighbour layer for transmission.
*
* Updates the multicast / broadcast output counters according to
* rt->rt_type, makes sure the skb has LL_RESERVED_SPACE(dev) bytes of
* headroom (reallocating it when the device has header_ops and the headroom
* is too small), then sends through dst->hh if present, else through
* dst->neighbour->output.
*
* Returns the result of the chosen output routine, -ENOMEM when the headroom
* reallocation fails, or -EINVAL when the dst has neither hh nor neighbour.
* The skb is freed on both error paths.
*/
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
/* The dst entry is viewed as a struct rtable to read rt_type. */
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
/* Attach the replacement skb to the same owning socket, if any. */
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
/* The original skb is dropped; continue with the reallocated one. */
kfree_skb(skb);
skb = skb2;
}
if (dst->hh)
return neigh_hh_output(dst->hh, skb);
else if (dst->neighbour)
return dst->neighbour->output(skb);
/* Neither output path exists: log (rate limited), free and fail. */
if (net_ratelimit())
printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
kfree_skb(skb);
return -EINVAL;
}
首先检查skb头部空间是否够放入一个链路层头(例如ethernet header),如果不够那么重新分配空间,之后调用neighbour子系统的发送函数,当然最后都是走到dev_queue_xmit里面
ip_queue_xmit:
packet_routed 标签前面的代码,都在为skb寻找一条合适的路由,如果没有的话,调用ip_route_output_flow找到一条,如果还是没有就把skb丢弃
如果skb有了路由,就开始构造ip头,主要代码如下:
packet_routed:
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->u.dst);
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
/* Transport layer set skb->h.foo itself. */
if (opt && opt->optlen) {
iph->ihl += opt->optlen >> 2;
ip_options_build(skb, opt, inet->daddr, rt, 0);
}
ip_select_ident_more(iph, &rt->u.dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
return ip_local_out(skb);
首先调用skb_push获取IP头的空间,如果有IP option也一起算进去,接着判断是否允许IP分段,并由此设置iph->frag_off标志,最后是设置ttl, protocol, src/dst address, priority, mark等。
ip_select_ident_more是为IP包找一个靠谱的ID,linux用了一个尽量不会重复的算法为每个IP包分配一个ID,为什么要这样呢?因为IP头里的ID字段只有16位,在高速网络中很快就会用完一轮并回绕,如果这时还有旧的分片(例如丢包、重传较多时)滞留在网络或重组队列里,重复的ID就可能让不同报文的分片被错误地重组在一起,所以要尽量避免ID重复
最后调用ip_local_out把skb发出去
TCP还会调用ip_build_and_send_pkt,ip_send_reply发送包,前者只会用在发送synack的场景中(参考__tcp_v4_send_synack函数),后者用于发送ACK, RST报文
ip_append_data:
ip_append_data会被 ICMP, UDP, RAW socket 等多处调用。ip_append_data会按MTU预先把数据组织成分片大小的skb,从而减轻后续IP分片的压力。每次ip_append_data都会向发送队列里的skb填充数据,直到数据长度为MTU为止,如果skb数据满了或这是第一个skb(此时发送队列为空,transhdrlen不为0),则新建一个skb出来。如此反复直到所有数据被拷贝到skb为止。
/*
* Queue user data on sk->sk_write_queue for later transmission.
*
* @getfrag:     callback used to copy the payload from @from into the skb
* @from:        opaque source handed to @getfrag
* @length:      number of bytes to append
* @transhdrlen: transport header length; forced to 0 when the write queue
*               already holds data
* @ipc, @rtp:   passed to ip_setup_cork() when the write queue is empty
* @flags:       MSG_* flags; with MSG_PROBE set nothing is queued
*
* Returns 0 for MSG_PROBE, the error from ip_setup_cork() if corking fails,
* otherwise the result of __ip_append_data().
*
* NOTE(review): the argument list of the __ip_append_data() call below
* (... transhdrlen, ipc, flags, no flowi4) does not match the
* __ip_append_data() definition quoted later in this file (which takes a
* struct flowi4 * as second parameter and no ipcm_cookie). The two excerpts
* appear to come from different kernel versions -- confirm which one is
* intended.
*/
int ip_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int err;
if (flags&MSG_PROBE)
return 0;
/* Empty write queue: this is the first chunk, so set up the cork state. */
if (skb_queue_empty(&sk->sk_write_queue)) {
err = ip_setup_cork(sk, (struct inet_cork *)&inet->cork, ipc,
rtp);
if (err)
return err;
} else {
/* Data is already queued, so this chunk carries no transport header. */
transhdrlen = 0;
}
return __ip_append_data(sk, &sk->sk_write_queue,
(struct inet_cork *)&inet->cork, getfrag,
from, length, transhdrlen, ipc, flags);
}
如果skb_queue_empty为真,表示sock->sk_write_queue发送队列里没有skb,这时候表示这是第一个skb,初始化好inet->cork这个结构体,否则表示这不是第一个skb了,那么很明显L4头已经在第一个skb里了,所以这里设置 transhdrlen = 0,表示这次塞进来的数据中,传输层头部长度为0
static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
struct sk_buff_head *queue,
struct inet_cork *cork,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (cork->length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
mtu-exthdrlen);
return -EMSGSIZE;
}
调用skb_peek_tail从队列尾部获得skb指针,当然如果队列为空那么skb为NULL。从cork里获得MTU=cork->fragsize。获得L2头部长度hh_len。fragheaderlen表示L3头部长度,可以看到这是IP头部长度+option长度之和。maxfraglen是一个L3 payload跟8字节对齐(多出的部分被算作fraggap,后面会提到)后+L3头部的长度,和MTU长度差别在8个字节之内。
/*
* transhdrlen > 0 means that this is the first fragment and we wish
* it won't be fragmented in the future.
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
rt->dst.dev->features & NETIF_F_V4_CSUM &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
这里设置 CHECKSUM_PARTIAL,表示由硬件执行校验和。由于包含了L4头部,我们希望这个skb不要被分段,同时如果该skb是UDP报文同时网卡又支持UDP offload同时报文长度又大于MTU,那么走ip_ufo_append_data的分支
cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
if (err)
goto error;
return 0;
}
如果塞进来的数据长度已经大于了mtu,那么如果是UDP协议,会尝试通过UFO进行offload,ip_ufo_append_data函数后面会介绍
if (!skb)
goto alloc_new_skb;
如果skb为空,新建一个skb,这时候是第一批数据被塞进来,否则已经有skb存在在队列里了
while (length > 0) {
/* Check if the remaining data fits into current packet. */
copy = mtu - skb->len;
copy计算出这个skb还可以拷贝多少数据
if (copy < length)
copy = maxfraglen - skb->len;
此时length超过了可以拷贝的数据,那么循环的这一轮只拷贝maxfraglen - skb->len的数据,由于这个skb的头部长度已经被计算过并包含在了skb->len中了,我们这时不需要重新计算
if (copy <= 0) {
这个场景比较巧,copy为0表示这个skb正好已经满了,copy < 0表示塞进来的数据是基于MTU长度的,但是由于8字节对齐,maxfraglen要稍微小于MTU一点,这部分因为对齐被截断的几个字节会放到下一个skb中去
char *data;
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
if (skb_prev)
fraggap = skb_prev->len - maxfraglen;
else
fraggap = 0;
这里计算被截掉的字节数
/*
* If remaining data exceeds the mtu,
* we know we need more fragment(s).
*/
datalen = length + fraggap;
datalen表示这次要塞的数据量
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
由于要新建一个skb,这时需要把L3头部计算在内,因此datalen最多只能是maxfraglen - fragheaderlen
fraglen = datalen + fragheaderlen;
fraglen表示这个skb分片的长度
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
alloclen = fraglen;
如果后续还有数据(MSG_MORE)同时设备又不支持scatter&gather,则按照MTU分配空间,否则按照数据长度分配
alloclen += exthdrlen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
if (datalen == length + fraggap)
alloclen += rt->dst.trailer_len;
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
(flags & MSG_DONTWAIT), &err);
调用sock_alloc_send_skb分配skb,长度是alloclen + hh_len + 15,这个未必是线性空间,如果支持scatter&gather,会是好几个分散的page
} else {
skb = NULL;
if (atomic_read(&sk->sk_wmem_alloc) <=
2 * sk->sk_sndbuf)
skb = sock_wmalloc(sk,
alloclen + hh_len + 15, 1,
sk->sk_allocation);
这里的分配实际上调用了alloc_skb,似乎分配的是线性空间
if (unlikely(skb == NULL))
err = -ENOBUFS;
else
/* only the initial fragment is
time stamped */
cork->tx_flags = 0;
}
if (skb == NULL)
goto error;
/*
* Fill in the control structures
*/
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
skb_shinfo(skb)->tx_flags = cork->tx_flags;
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen + exthdrlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
data += fragheaderlen + exthdrlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
data + transhdrlen, fraggap, 0);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
由于数据被截断,需要重新计算上一个skb的校验和
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
计算真正要拷贝的数据长度,由于是第一个skb,需要把L4头也计算进去
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
拷贝数据完毕
offset += copy;
length -= datalen - fraggap;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
/*
* Put the packet on the pending queue.
*/
__skb_queue_tail(queue, skb);
continue;
}
下面是剩余的空间完全可以容纳新塞进来的数据的场景
if (copy > length)
copy = length;
if (!(rt->dst.dev->features&NETIF_F_SG)) {
unsigned int off;
off = skb->len;
if (getfrag(from, skb_put(skb, copy),
offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
如果不支持scatter&gather,那么直接拷贝到线性空间
} else {
int i = skb_shinfo(skb)->nr_frags;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
frag指向最后一个page
struct page *page = cork->page;
发送缓冲区使用的page
int off = cork->off;
unsigned int left;
if (page && (left = PAGE_SIZE - off) > 0) {
如果这个page还有空闲空间,先用着再说
if (copy >= left)
copy = left;
if (page != skb_frag_page(frag)) {
如果这个page不是scatter&gather使用的最后一个page,加到scatter&gather数组中,skb_shinfo(skb)->frags
if (i == MAX_SKB_FRAGS) {
err = -EMSGSIZE;
goto error;
}
skb_fill_page_desc(skb, i, page, off, 0);
skb_frag_ref(skb, i);
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
如果页没有空闲空间,调用alloc_pages新建一个页,并加入到frags数组中
if (copy > PAGE_SIZE)
copy = PAGE_SIZE;
page = alloc_pages(sk->sk_allocation, 0);
if (page == NULL) {
err = -ENOMEM;
goto error;
}
cork->page = page;
cork->off = 0;
skb_fill_page_desc(skb, i, page, 0, 0);
frag = &skb_shinfo(skb)->frags[i];
} else {
err = -EMSGSIZE;
goto error;
}
if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
offset, copy, skb->len, skb) < 0) {
err = -EFAULT;
goto error;
}
调用getfrag拷贝数据
cork->off += copy;
skb_frag_size_add(frag, copy);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
}
offset += copy;
length -= copy;
}
return 0;
error:
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
最后来看下ip_ufo_append_data
/*
 * Append data for a datagram that is sent as one large skb with
 * gso_type SKB_GSO_UDP (UDP fragmentation offload).
 *
 * When @queue is empty a new skb is allocated with room for the hardware,
 * IP and transport headers, its header offsets, checksum mode and GSO
 * fields are initialised, and it is put on @queue. In every case the payload
 * (@length minus @transhdrlen bytes) is then appended as page fragments via
 * skb_append_datato_frags().
 *
 * Returns the error reported by sock_alloc_send_skb() if the allocation
 * fails, otherwise the result of skb_append_datato_frags().
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;
	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
如果队列为空,那么新建一个skb,否则加到原有的skb里,这个逻辑和非ufo的情况基本一致
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
sock_alloc_send_skb调用sock_alloc_send_pskb,后者创建一个skb结构,其中需要传入header_len, data_len两个参数,其中header_len为L2L3L4层的头部长度,存在skb线性空间里,data_len为payload,会按照这个大小计算出需要的page数,之后一个个page分配,并按照scatter/gather的模式插入到skb_shared_info的frags数组中
		if (skb == NULL)
			return err;
		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);
增加skb->data, skb->tail,预留出hh_len的空间,相当于从skb->data到skb->tail的线性空间地址往下挪一段。注意在skb的定义中,skb->head, skb->data是虚拟地址,位数和CPU相关。skb->end, skb->tail只是相对于skb->head的偏移量,一般都是32位,这么做可以减少skb的大小
		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);
skb->tail, skb->len增加fragheaderlen+transhdrlen的长度
		/* initialize network header pointer */
		skb_reset_network_header(skb);
我们会看到很多这样的skb_set_xxx_header/skb_reset_xxx_header/skb_xxx_header函数,这里的xxx可以是mac/network/transport等,首先明确一点skb->network_header, skb->mac_header, skb->transport_header都表示报文头的地址
skb_set_xxx_header设置skb->xxx_header的值,skb_reset_xxx_header把skb->xxx_header指向skb->data
		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}
	/* Only the payload goes into page frags; the header space was reserved above. */
	return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen));
}
/*
 * Append @length bytes, fetched through @getfrag from @from, to @skb as
 * page fragments.
 *
 * Every loop iteration allocates one new page, attaches it as the next
 * entry of skb_shinfo(skb)->frags, charges PAGE_SIZE to skb->truesize and
 * sk->sk_wmem_alloc, and copies at most one page worth of data into it.
 * The loop body runs at least once, even when @length is 0.
 *
 * Returns 0 on success, -EFAULT when the skb already holds MAX_SKB_FRAGS
 * fragments or @getfrag fails, and -ENOMEM when a page cannot be allocated.
 * Pages attached before a failure stay on the skb; the caller is expected
 * to release them with kfree_skb().
 */
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
			int (*getfrag)(void *from, char *to, int offset,
				       int len, int odd, struct sk_buff *skb),
			void *from, int length)
{
	int frg_cnt = 0;
	skb_frag_t *frag = NULL;
	struct page *page = NULL;
	int copy, left;
	int offset = 0;
	int ret;

	do {
		/* Return error if we don't have space for new frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		if (frg_cnt >= MAX_SKB_FRAGS)
			return -EFAULT;

		/* allocate a new page for next frag */
		page = alloc_pages(sk->sk_allocation, 0);

		/* If alloc_page fails just return failure and caller will
		 * free previous allocated pages by doing kfree_skb()
		 */
		if (page == NULL)
			return -ENOMEM;

		/* initialize the next frag */
		skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
		skb->truesize += PAGE_SIZE;
		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);

		/* get the new initialized frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		frag = &skb_shinfo(skb)->frags[frg_cnt - 1];

		/* copy the user data to page */
		left = PAGE_SIZE - frag->page_offset;
		copy = (length > left)? left : length;

		ret = getfrag(from, (page_address(frag->page) +
			      frag->page_offset + frag->size),
			      offset, copy, 0, skb);
		if (ret < 0)
			return -EFAULT;

		/* copy was successful so update the size parameters */
		frag->size += copy;
		skb->len += copy;
		skb->data_len += copy;
		offset += copy;
		length -= copy;
	} while (length > 0);

	return 0;
}
这段代码和之前的基本没区别,大致就是把数据从from拷贝到这个skb里:每轮循环先分配一个新的page并加到frags数组里,再往这个page里拷贝数据,每次最多拷贝一个page大小的数据,直到length个字节全部拷贝完为止(如果frags数组已满则返回错误)
ip_append_data之后,会调用ip_push_pending_frames把包发出去
ip_push_pending_frames首先调用ip_finish_skb,该函数只是调用了__ip_make_skb。之前ip_append_data时,并没有填充ip头,这里要做这件事情,同时把队列里的所有skb合并到一个skb结构里,放哪里呢?当然是skb_shared_info->frag_list里咯
if ((skb = __skb_dequeue(queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
如果队列为空,那么无事可做
/* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb));
while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
__skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
顺序把队列里的每个skb加到frag_list里面
if (cork->flags & IPCORK_OPT)
opt = cork->opt;
if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
ttl = ip_select_ttl(inet, &rt->u.dst);
iph = (struct iphdr *)skb->data;
iph->version = 4;
iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, cork->addr, rt, 0);
}
iph->tos = inet->tos;
iph->frag_off = df;
ip_select_ident(iph, &rt->u.dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
构造skb的ip头部
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
*/
cork->dst = NULL;
skb_dst_set(skb, &rt->u.dst);
if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(net, ((struct icmphdr *)
skb_transport_header(skb))->type);
/* Netfilter gets whole the not fragmented skb. */
ip_cork_release(cork);
out:
return skb;
ip_push_pending_frames之后调用ip_send_skb,该函数实际上最终调用了__ip_local_out:
/*
* Finalise the IP header of a locally generated packet and run it through
* the NF_INET_LOCAL_OUT netfilter hook, with dst_output as the continuation.
*
* Writes skb->len (network byte order) into iph->tot_len and calls
* ip_send_check() on the header before invoking the hook. Returns whatever
* nf_hook() returns.
*/
int __ip_local_out(struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
/* NOTE(review): presumably (re)computes the IP header checksum -- confirm against ip_send_check(). */
ip_send_check(iph);
/* NULL and the route's device are the two device arguments handed to the hook. */
return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
}
dst_output会调用neighbour->output,最终调用dev_queue_xmit把包发出去
更多推荐
所有评论(0)