net_device分析
<br />本文档的Copyleft归popy所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,<br />严禁用于任何商业用途。<br />gtalk: mypopy at gmail.com <br />来源:barrypopy.cublog.cn<br />1.不算短的一个引子<br />很早之前,就说要分析Linux网络部分的代码,可惜话是放出去了,却迟迟不能兑现,没什么
本文档的Copyleft归popy所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,
严禁用于任何商业用途。
gtalk: mypopy at gmail.com
来源:barrypopy.cublog.cn
1.不算短的一个引子
很早之前,就说要分析Linux网络部分的代码,可惜话是放出去了,却迟迟不能兑现,没什么信用.
今天没什么事情,看看Netlink的用法和实现,看看Linux下的QoS的实现,又看到了net_device结构,
想写些东西,也算是让最近躁动不安的心有些许的平静,不让自己陷入一种不着边际的情绪.
不想说很多,我们还是从Code开始,Linux说"Use the Source, Luke!",因此,我们还是从Source开始吧
[linux-2.6.23-11]
/* include/linux/netdevice.h */
/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
struct net_device
{
/*
* This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* the interface.
*/
char name[IFNAMSIZ];
/* device name hash chain */
struct hlist_node name_hlist;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end; /* shared mem end */
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
*/
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
unsigned long state;
struct list_head dev_list;
/* The device initialization function. Called only once. */
int (*init)(struct net_device *dev);
/* ------- Fields preinitialized in Space.c finish here ------- */
/* Net device features */
unsigned long features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0xffff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
struct net_device *next_sched;
/* Interface index. Unique device identifier */
int ifindex;
int iflink;
struct net_device_stats* (*get_stats)(struct net_device *dev);
struct net_device_stats stats;
#ifdef CONFIG_WIRELESS_EXT
/* List of functions to handle Wireless Extensions (instead of ioctl).
* See <net/iw_handler.h> for details. Jean II */
const struct iw_handler_def * wireless_handlers;
/* Instance data managed by the core of Wireless Extensions. */
struct iw_public_data * wireless_data;
#endif
const struct ethtool_ops *ethtool_ops;
/*
* This marks the end of the "visible" part of the structure. All
* fields hereafter are internal to the system, and may change at
* will (read: may be cleaned up at will).
*/
unsigned int flags; /* interface flags (a la BSD) */
unsigned short gflags;
unsigned short priv_flags; /* Like 'flags' but invisible to
userspace. */
unsigned short padded; /* How much padding added by alloc_netdev() */
unsigned char operstate; /* RFC2863 operstate */
unsigned char link_mode; /* mapping policy to operstate */
unsigned mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
struct net_device *master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
unsigned char addr_len; /* hardware address length */
unsigned short dev_id; /* for shared network cards */
struct dev_addr_list *uc_list; /* Secondary unicast mac addresses */
int uc_count; /* Number of installed ucasts */
int uc_promisc;
struct dev_addr_list *mc_list; /* Multicast mac addresses */
int mc_count; /* Number of installed mcasts */
int promiscuity;
int allmulti;
/* Protocol specific pointers */
void *atalk_ptr; /* AppleTalk link */
void *ip_ptr; /* IPv4 specific data */
void *dn_ptr; /* DECnet specific data */
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
assign before registering */
/*
* Cache line mostly used on receive path (including eth_type_trans())
*/
struct list_head poll_list ____cacheline_aligned_in_smp;
/* Link to poll list */
int (*poll) (struct net_device *dev, int *quota);
int quota;
int weight;
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
because most packets are unicast) */
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
/*
* Cache line mostly used on queue transmit path (qdisc)
*/
/* device queue lock */
spinlock_t queue_lock ____cacheline_aligned_in_smp;
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
struct list_head qdisc_list;
unsigned long tx_queue_len; /* Max frames per queue allowed */
/* Partially transmitted GSO packet. */
struct sk_buff *gso_skb;
/* ingress path synchronizer */
spinlock_t ingress_lock;
struct Qdisc *qdisc_ingress;
/*
* One part is mostly used on xmit path (device)
*/
/* hard_start_xmit synchronizer */
spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
/* cpu id of processor entered to hard_start_xmit or -1,
if nobody entered there.
*/
int xmit_lock_owner;
void *priv; /* pointer to private data */
int (*hard_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
/* These may be needed for future network-power-down code. */
unsigned long trans_start; /* Time (in jiffies) of last Tx */
int watchdog_timeo; /* used by dev_watchdog() */
struct timer_list watchdog_timer;
/*
* refcnt is a very hot point, so align it on SMP
*/
/* Number of references to this device */
atomic_t refcnt ____cacheline_aligned_in_smp;
/* delayed register/unregister */
struct list_head todo_list;
/* device index hash chain */
struct hlist_node index_hlist;
struct net_device *link_watch_next;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
} reg_state;
/* Called after device is detached from network. */
void (*uninit)(struct net_device *dev);
/* Called after last user reference disappears. */
void (*destructor)(struct net_device *dev);
/* Pointers to interface service routines. */
int (*open)(struct net_device *dev);
int (*stop)(struct net_device *dev);
#define HAVE_NETDEV_POLL
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
void *daddr,
void *saddr,
unsigned len);
int (*rebuild_header)(struct sk_buff *skb);
#define HAVE_CHANGE_RX_FLAGS
void (*change_rx_flags)(struct net_device *dev,
int flags);
#define HAVE_SET_RX_MODE
void (*set_rx_mode)(struct net_device *dev);
#define HAVE_MULTICAST
void (*set_multicast_list)(struct net_device *dev);
#define HAVE_SET_MAC_ADDR
int (*set_mac_address)(struct net_device *dev,
void *addr);
#define HAVE_PRIVATE_IOCTL
int (*do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
int (*set_config)(struct net_device *dev,
struct ifmap *map);
#define HAVE_HEADER_CACHE
int (*hard_header_cache)(struct neighbour *neigh,
struct hh_cache *hh);
void (*header_cache_update)(struct hh_cache *hh,
struct net_device *dev,
unsigned char * haddr);
#define HAVE_CHANGE_MTU
int (*change_mtu)(struct net_device *dev, int new_mtu);
#define HAVE_TX_TIMEOUT
void (*tx_timeout) (struct net_device *dev);
void (*vlan_rx_register)(struct net_device *dev,
struct vlan_group *grp);
void (*vlan_rx_add_vid)(struct net_device *dev,
unsigned short vid);
void (*vlan_rx_kill_vid)(struct net_device *dev,
unsigned short vid);
int (*hard_header_parse)(struct sk_buff *skb,
unsigned char *haddr);
int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
#ifdef CONFIG_NETPOLL
struct netpoll_info *npinfo;
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*poll_controller)(struct net_device *dev);
#endif
/* bridge stuff */
struct net_bridge_port *br_port;
/* macvlan */
struct macvlan_port *macvlan_port;
/* class/net/name entry */
struct device dev;
/* space for optional statistics and wireless sysfs groups */
struct attribute_group *sysfs_groups[3];
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops;
/* The TX queue control structures */
unsigned int egress_subqueue_count;
struct net_device_subqueue egress_subqueue[1];
};
结构很大,如果是在我现在的Coding生涯,你定义这样一个结构给我用,我可能要骂娘了:(,
不过这是"高手"的作品,自然有它的道理(盲目崇拜了吧:(,我的错)
我只喜欢简单可靠的API,不想了解怎样实现这些API的,这也是一种懒惰吧.可是现在,我们
要转换一下思路了,如果是我们来提供API,该又如何呢?
不过看Code的注释,也知道有人对这么大一个结构体不满,不止是我一个.我们现在是慢慢分析
各个字段(field)的作用,抛开对于"大"的偏见吧,别人都能写出来,为什么我们就不能看看呢,
是吧:)
2.我从哪里来?
如果对于Unix/Linux的一些基本哲学有所了解的话,一定知道"Everything is file"的思想
,很可惜,对于网络设备,是个例外.
网络设备很特殊,所以有了"net_device"!这就是我的由来,也许你会有更多疑问,不妨自己
去探究一番,虽然不喜欢朱熹老先生,但至少他还说了句实话--"纸上得来终觉浅,绝知此事
要躬行"(按:此句实出自陆游《冬夜读书示子聿》,并非朱熹所作)
不想继续说这个问题,还是直接切入主题吧!
3.芸芸众生为哪般
这么大的结构,我一时还真不知道从哪开始,我小学语文老师说,这叫"狗咬刺猬--无从下口"
我们简单说过来,点点滴滴.
我实在不太愿意整理这么多的字段,选择了最懒惰的方式,从头来过
name:
注释很清楚吧,如果你还有些疑惑,不妨ifconfig -a,输出的每个interface的名字就是
这个字段输出的结果.
/*
* This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* the interface.
*/
char name[IFNAMSIZ];
name_hlist:
只是为了连接各个网络设备,可能你只有一块网卡,但其它人可能不止一块哦,再加上一
些虚拟的网络设备,数量就更大了,这是以设备名字为HASH Key构成hash链表,其实对于
net_device的管理,用了3个数据结构,一个全局链表,两个HASH表,分别是这几个字段:
/* device name hash chain */
struct hlist_node name_hlist;
struct list_head dev_list;
/* device index hash chain */
struct hlist_node index_hlist;
看看字段名字就知道,全局链表就是个链表而已(dev_base_head),HASH分别以name,ifindex
为HASH的Key.
为了证明我不是胡言乱语,我决定还是拿出证据来,免得引起人品的质疑:)
/* net/core/dev.c */
/**
* register_netdevice - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* Callers must hold the rtnl semaphore. You may want
* register_netdev() instead of this.
*
* BUGS:
* The locking appears insufficient to guarantee two parallel registers
* will not get the same name.
*/
int register_netdevice(struct net_device *dev)
{
...
write_lock_bh(&dev_base_lock);
list_add_tail(&dev->dev_list, &dev_base_head);
hlist_add_head(&dev->name_hlist, head);
hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
dev_hold(dev);
write_unlock_bh(&dev_base_lock);
...
}
为什么用这么多的方式?
原因很简单,写Code的人很懒,不同的查询方式为了更方便自己的Code,毕竟世界丰富而
且有趣,不只是因为Linux:)
mem_end,mem_start,base_addr,irq,if_port,dma硬件相关字段:
我都不想理它:)
如果真的有兴趣,看LDD3吧,我对于硬件实在是新手中的新手,不要为难我:(
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end; /* shared mem end */
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
*/
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
state:
unsigned long state;
如果你写Code比较多,就知道这种字段是为了描述设备的信息,但我们要更进一步,描述
的究竟是啥玩意?
ULNI中说"A set of flags used by the network queuing subsystem."
为了表示我看过该字段,我决定更完整点:)
其实这个字段的取值就在net_device结构定义的前面一些:
/* These flag bits are private to the generic network queueing
* layer, they may not be explicitly referenced by any other
* code.
*/
enum netdev_state_t
{
__LINK_STATE_XOFF=0,
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_SCHED,
__LINK_STATE_NOCARRIER,
__LINK_STATE_RX_SCHED,
__LINK_STATE_LINKWATCH_PENDING,
__LINK_STATE_DORMANT,
__LINK_STATE_QDISC_RUNNING,
};
这几个枚举我都grep过一番,除了下面几个意义很明显以外,其它的我并不是很清楚:(
a).__LINK_STATE_XOFF
表示网络设备的发送队列被停止(netif_stop_queue()),暂时不能用来发包了,出现这种情形有
两种情况,一是不能发包了,二则是网络设备忙.(二则需要区分么??请看下文)
b).__LINK_STATE_START
表示网络设备被打开,注意,是"打开",不是说可以用来收包哦,原因是可能出现网络设
备被打开,但busy的情况,因此,要能发包收包,实际上需要设置__LINK_STATE_START,并且设
备不能处于BUSY状态
其它字段我就不是很清楚了:(
最后叨唠一句,该字段要用提供的API来操作,不要自己直接设值,这些API就在同一个文
件里面:
命名方式是:static inline void netif_xxx_xx(struct net_device *dev);
init:
LDD3中说,该函数指针很少用到,看来我们不需要关注它了:)
/* The device initialization function. Called only once. */
int (*init)(struct net_device *dev);
features:
这个字段实际上描述的是网络设备的硬件能力,好像只与驱动关系密切,研究网络实现
时候似乎可以不必追究,感兴趣的话看LDD3:)
/* Net device features */
unsigned long features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets
*/
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0xffff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
ifindex,iflink:
每个网络设备,除了可以用name的方式表示以外,还可以用ifindex表示,当网络设备注
册时候调用dev_new_index()获取
iflink为什么会出现呢?其实我们说的网络设备,是个泛化的概念,除了你想到的网卡以
外,还有一些虚拟网络设备,比如隧道,它也有ifindex,但它最终要附着于一个实际的网络设
备,而iflink就是它附着的实际(物理)网络设备.
/* Interface index. Unique device identifier */
int ifindex;
4.一段与故事无关的历史?
这是一个岔开的话题,但我觉得很重要.
都知道sk_buff代表socket buffer,但是它来自何处,去向何方?
仔细想想就可以确定,肯定是网络设备去创建了它,网络设备又是如何得知如何创建呢?
我们慢慢看来.
不想去分析网卡驱动,但是可以简单说明一下过程.
1).物理网络设备接收到数据包,并触发一个中断[硬件中断]告知内核.
2).该中断被内核处理,调用对应注册的中断处理函数,我们可以称之为:
net_interrupt_handler().
3).在网络设备驱动程序中,有两种处理收到的网络包的方法,分别为:a).一直沿用到现在的
网卡普通中断处理方式,使用的是netif_rx()函数.b).NAPI方式.前者是一个纯粹的中断处理
方式,但是如果网络流量比较大,并且都是比较小的数据包的时候,每个包产生一个中断有些
奢侈,还不如直接轮询(poll),NAPI就是基于这种思想提出来的,对于驱动程序来说,则是先关
闭网络设备中断,然后调用netif_rx_schedule().
###############################################################################
如果对于驱动如何收包感兴趣,推荐参考LDD3第17章,还有分析NAPI的一篇文章:
<NAPI 技术在 Linux 网络驱动上的应用和完善>
http://www.ibm.com/developerworks/cn/linux/l-napi/index.html
###############################################################################
4).自此,我们和驱动程序脱离关系,完全由软件来控制(软中断),由do_softirq()开始
调度软中断,对于数据包的接收,调用的是:net_rx_action().[软件中断]
5).向上层分发包(关键函数是netif_receive_skb()).
过程说得很简略,但是你要知道其实每步分析清楚都不算一个小的工作量,我们先从全
局的角度看过去,细节问题慢慢说来,一下子陷入细节会让人"云深不知处".
知道了这个大概,可以开始深究了:)
5.继续歧途?
我有一个很不好的习惯,喜欢一条路走到底,于是经常头破血流,头破血流就不说了,可
下次还是如此,MM说是固执,我辩解说,不会的,我怎么可能在一颗石头上摔死呢,不会有下次
的,可惜只是狡辩,下次依然在这块石头上摔跤:(
这就是生活,你以为你选择的捷径,然后按照选择继续前行,某天回头,才发现一路足迹
歪斜,跌跌撞撞,不过还好,到罗马不止365条路,选择自己的路,一路向前吧.
对于驱动的部分,我就不继续了,我不可能超越LDD3,因为我对于硬件,实在外行.
我们来看看驱动往后一点点的地方.----网络处理的软中断的实现:)
我们跳过了硬件中断,如果再跳过软中断的话,我们会错过很多精彩的地方.
每个人都知道,对于网络设备,有两个软中断,定义于include/linux/interrupt.h.
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ, /* 故事结束的地方:) */
NET_RX_SOFTIRQ, /* 故事开始的地方:) */
BLOCK_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
#ifdef CONFIG_HIGH_RES_TIMERS
HRTIMER_SOFTIRQ,
#endif
};
我们自然从"故事开始的地方"起步,在网络设备初始化的时候,已经埋下伏笔:
/* net/core/dev.c */
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
......
/*
* Initialise the packet receive queues.
*/
for_each_possible_cpu(i) {
struct softnet_data *queue;
queue = &per_cpu(softnet_data, i);
skb_queue_head_init(&queue->input_pkt_queue);
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
queue->backlog_dev.weight = weight_p;
#######################################
[1].需要注意的地方,我们后面说:)
#######################################
queue->backlog_dev.poll = process_backlog;
atomic_set(&queue->backlog_dev.refcnt, 1);
}
netdev_dma_register();
dev_boot_phase = 0;
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
......
}
省略掉的部分,在我以前的文章中已经分析过了,感兴趣的话,麻烦你去查找一下:)
struct softnet_data:
这个结构体我们初见,看着有些陌生,但还不至于不能理解.
/* include/linux/netdevice.h */
/*
* Incoming packets are placed on per-cpu queues so that
* no locking is needed.
*/
struct softnet_data
{
struct net_device *output_queue; /* 网络设备发送队列的队列头 */
struct sk_buff_head input_pkt_queue;/* 接收缓冲区的sk_buff队列 */
struct list_head poll_list; /* poll设备队列头 */
struct sk_buff *completion_queue;/* 完成发送等待释放的sk_buff队列 */
/* 当前参与poll操作的网络设备 */
struct net_device backlog_dev; /* Sorry. 8) */
#ifdef CONFIG_NET_DMA
struct dma_chan *net_dma;
#endif
};
其实结构struct softnet_data为每个CPU所私有,原因是当存在SMP时候,不需要互斥访
问.
我们来看net_rx_action():
/* net/core/dev.c */
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&queue->poll_list)) {
struct net_device *dev;
if (budget <= 0 || jiffies - start_time > 1)
goto softnet_break;
local_irq_enable();
dev = list_entry(queue->poll_list.next,
struct net_device, poll_list);
have = netpoll_poll_lock(dev);
################################################
[2].dev->poll()需要注意
################################################
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
netpoll_poll_unlock(have);
local_irq_disable();
list_move_tail(&dev->poll_list, &queue->poll_list);
if (dev->quota < 0)
dev->quota += dev->weight;
else
dev->quota = dev->weight;
} else {
netpoll_poll_unlock(have);
dev_put(dev);
local_irq_disable();
}
}
out:
local_irq_enable();
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
if (!cpus_empty(net_dma.channel_mask)) {
int chan_idx;
for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
struct dma_chan *chan = net_dma.channels[chan_idx];
if (chan)
dma_async_memcpy_issue_pending(chan);
}
}
#endif
return;
softnet_break:
__get_cpu_var(netdev_rx_stat).time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
显然这是在处理软中断,但是,谁去触发了软中断NET_RX_SOFTIRQ呢?
分别为函数:__netif_rx_schedule(),net_rx_action(),netif_rx_reschedule(),因此,以后要注意
Code中间的这几个函数哦:)
其实,如果驱动是采用原始的中断方式,触发软中断由netif_rx()执行(调用netif_rx_schedule),
而采用NAPI方式的驱动则是自己在中断处理函数中调用netif_rx_schedule/__netif_rx_schedule,
这样,最终都能触发软中断,使得可以把数据包分发到上层.
前面我们留下两个疑惑[1],[2],现在我们可以解开了,其实在[1]处,是定义dev->poll()的默认
处理方法(即对于没有实现poll功能驱动程序的处理),而[2]处则是一个选择,对于有poll功能
的网络驱动,采用驱动自己定义的poll()函数,否则采用默认的process_backlog().
典型的C实现的多态
故事就到这里了,后续我们将回归正途:)
待续......
参考:
1.LDD3,第17章
2.Understanding Linux Network Internals 第9,10章
3.The Linux® Networking Architecture: Design and Implementation of Network
Protocols in the Linux Kernel 第6章.
4.<NAPI 技术在 Linux 网络驱动上的应用和完善>
http://www.ibm.com/developerworks/cn/linux/l-napi/index.html
文章出处:飞诺网(www.firnow.com):http://dev.firnow.com/course/6_system/linux/Linuxjs/200868/123572_4.html
更多推荐
所有评论(0)