背景
Linux 网络–数据包的接收 一文中分析了 Linux 是如何接收一个数据包的,那么 Linux 在发送数据包的时候又是什么样的呢?本文聚焦于 Linux 数据包的发送,结合内核源码分析数据包的发送过程。
关于 Linux 网络数据包的接收可以阅读前文:
通过阅读本文应该能够知悉:
- Linux 是如何发送数据,从发出一个
send
系统调用到数据被发送出去需要经历什么样的过程; - 数据在发送的过程中会经过几次复制,每次复制的原因是什么;
- 内核发送数据的时候的 CPU 消耗在哪里;
- 发送数据时的
iptables
过滤规则工作在哪里。
本文基于内核 3.10 版本,网卡举例使用的是 igb 网卡。
send 系统调用
在发送数据的时候会用到 send
系统调用,send
系统调用的内部实际上调用的是 sendto
系统调用,在 sendto
中主要完成了这样几件事:
- 通过
fd
调用sockfd_lookup_light
将内核上中的fd
找出来。socket 对象上记录了各个协议的处理函数,这里可以参考下图,了解socket
对象的结构; - 在该函数中构造了一个
msghdr
对象,在这个对象中包含了发送数据的相关信息,比如发送数据的地址,发送数据的长度等; - 调用
sock_sendmsg
函数,将数据发送出去。
// net/socket.c
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/socket.c#L1753-L1804
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
unsigned int, flags, struct sockaddr __user *, addr,
int, addr_len)
{
struct socket *sock;
struct sockaddr_storage address;
int err;
struct msghdr msg;
struct iovec iov;
int fput_needed;
if (len > INT_MAX)
len = INT_MAX;
sock = sockfd_lookup_light(fd, &err, &fput_needed); // 通过 fd 找到 socket 对象
if (!sock)
goto out;
// 构造 msghdr 对象 msg
iov.iov_base = buff;
iov.iov_len = len;
msg.msg_name = NULL;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
if (addr) {
err = move_addr_to_kernel(addr, addr_len, &address);
if (err < 0)
goto out_put;
msg.msg_name = (struct sockaddr *)&address;
msg.msg_namelen = addr_len;
}
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
msg.msg_flags = flags;
err = sock_sendmsg(sock, &msg, len); // 调用 sock_sendmsg 函数发送数据
out_put:
fput_light(sock->file, fput_needed);
out:
return err;
}
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
unsigned int, flags)
{
return sys_sendto(fd, buff, len, flags, NULL, 0);
}
这里顺着 sock_sendmsg
往下看,相关的函数调用的链条是:sock_sendmsg
–> __sock_sendmsg
–> __sock_sendmsg_nosec
,中间的细节就不在此展开了,直接看最后的 __sock_sendmsg_nosec
函数。在这个函数的结尾,sock->ops->sendmsg(iocb, sock, msg, size)
;调用的是 socket
对象中的 sendmsg
函数,而根据上面 socket
对象的结构图,我们可以知道最后实际调用的是 inet_sendmsg
函数。
static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size)
{
struct sock_iocb *si = kiocb_to_siocb(iocb);
si->sock = sock;
si->scm = NULL;
si->msg = msg;
si->size = size;
return sock->ops->sendmsg(iocb, sock, msg, size);
}
传输层处理
上文提到在 send
的最后会找到 fd
对应的 socket
对象,内核会根据 socket
对象的协议族,对于 AF_INET
的协议族,会调用 inet_sendmsg
函数,函数的具体实现如下,sk->sk_prot->sendmsg
会调用到具体的协议的发送函数,对于 TCP
协议来说,最终会调用到 tcp_sendmsg
函数。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/af_inet.c#L760-L773
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
下面是 tcp_sendmsg
函数的具体实现。在 tcp_sendmsg
函数中,内核申请了 skb
内存,并将用户待发送的数据拷贝进去,最后将 skb
添加到发送队列中,需要注意的是,这里并不一定会立刻开始真正的发送,内核会根据一定的条件来判断是否需要发送数据,很有可能就直接返回了。
在这里的实现中,首先会通过 tcp_write_queue_tail
获取到发送队列的尾部,用户的发送队列就是一个 skb
链表。内核在此处会分配内存,并把用户空间内存中的数据拷贝到内核态内存中,因此这里会涉及到一次或者多次的内存拷贝。最后内核会在满足条件的时候将数据发送出去,只有在满足条件的时候内核才会真正的发送数据。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/tcp.c#L1016-L1241
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
bool sg;
long timeo;
lock_sock(sk);
flags = msg->msg_flags;
if (flags & MSG_FASTOPEN) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
goto out_err;
offset = copied_syn;
}
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto do_error;
}
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
goto out;
}
err = -EINVAL;
if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
/* 'common' sending to sendq */
}
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
iovlen = msg->msg_iovlen; // 用户空间数据长度
iov = msg->msg_iov; // 用户空间数据地址
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
sg = !!(sk->sk_route_caps & NETIF_F_SG);
// 遍历用户空间数据
while (--iovlen >= 0) {
size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base; // 待发送数据的地址
iov++;
if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
if (offset >= seglen) {
offset -= seglen;
continue;
}
seglen -= offset;
from += offset;
offset = 0;
}
while (seglen > 0) {
int copy = 0;
int max = size_goal;
// 获取发送队列的尾部
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
}
if (copy <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
// 申请一个新的 skb,并添加到发送队列尾部
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
/*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
skb->ip_summed = CHECKSUM_PARTIAL;
// 将新的 skb 添加到发送队列尾部
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
}
/* Try to append data to the end of skb. */
if (copy > seglen)
copy = seglen;
/* Where to copy to? */
if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy); // 将数据拷贝到 skb 中
if (err)
goto do_fault;
} else {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
goto wait_for_memory;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
if (i == MAX_SKB_FRAGS || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
merge = false;
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
err = skb_copy_to_page_nocache(sk, from, skb,
pfrag->page,
pfrag->offset,
copy);
if (err)
goto do_error;
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
get_page(pfrag->page);
}
pfrag->offset += copy;
}
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
skb_shinfo(skb)->gso_segs = 0;
from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
// 发送判断,满足条件时内核会发送数据
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
return copied + copied_syn;
do_fault:
if (!skb->len) {
tcp_unlink_write_queue(skb, sk);
/* It is the one place in all of TCP, except connection
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
sk_wmem_free_skb(sk, skb);
}
do_error:
if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
EXPORT_SYMBOL(tcp_sendmsg);
当满足发送条件的时候,内核最终会调用 tcp_write_xmit
发送数据。在 tcp_write_xmit
函数中处理了传输层的拥塞控制,滑动窗口等,最后会调用 tcp_transmit_skb
开始真正的数据发送。
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
sent_pkts = 0;
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
return false;
} else if (result > 0) {
sent_pkts = 1;
}
}
// 循环等待队列, 获取发送队列的头部
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
goto repair; /* Skip network transmission */
// 滑动窗口控制
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
else
break;
}
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
if (!push_one && tcp_tso_should_defer(sk, skb))
break;
}
/* TSQ : sk_wmem_alloc accounts skb truesize,
* including skb overhead. But thats OK.
*/
if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
break;
}
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
sk->sk_gso_max_segs));
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
// 调用 tcp_transmit_skb 发送数据
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
sent_pkts += tcp_skb_pcount(skb);
if (push_one)
break;
}
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
tcp_cwnd_validate(sk);
return false;
}
return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
}
这里再来看看具体的发送实现 tcp_transmit_skb
。
在具体发送的时候,会复制一个新的 skb 出来,这是因为 TCP 是支持丢失重传的,所以在发送的时候会复制一个新的 skb
出来,在收到对方的 ACK 之前 skb
是不能被删除的,需要等到收到 ACK
才能删除。注意此处又有数据复制操作。
除了复制出一个新的 skb
之外,还会对 skb
进行一些处理,此处就是设置 TCP 头部,计算校验和。这里需要注意个一个点是 skb
内部实际上是包含所有的协议头的,这样在设置的时候住需要将指移动到合适的位置,就能够设置 TCP 头部了。后面要介绍的设置 IP 的头部也是如此。通过这种方式能够避免内存的申请和拷贝,提高效率。
在 tcp_transmit_skb
函数的最后,会通过 icsk->icsk_af_ops->queue_xmit
将 skb
送往网络层。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/tcp_output.c#L828-L957
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
/* If congestion control is doing timestamping, we must
* take such a timestamp before we potentially clone/copy.
*/
if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
__net_timestamp(skb);
// 克隆 skb
if (likely(clone_it)) {
const struct sk_buff *fclone = skb + 1;
if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
fclone->fclone == SKB_FCLONE_CLONE))
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask); // 克隆 skb
if (unlikely(!skb))
return -ENOBUFS;
}
inet = inet_sk(sk);
tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
else
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue.
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb->sk = sk;
skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
tcp_wfree : sock_wfree;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
// 设置 tcp 头
/* Build TCP header and checksum it. */
th = tcp_hdr(skb);
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
} else {
th->window = htons(tcp_select_window(sk));
}
th->check = 0;
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
if (before(tp->snd_up, tcb->seq + 0x10000)) {
th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
th->urg_ptr = htons(0xFFFF);
th->urg = 1;
}
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
icsk->icsk_af_ops->send_check(sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, sk);
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
// 调用网络层的发送接口
err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
if (likely(err <= 0))
return err;
tcp_enter_cwr(sk, 1);
return net_xmit_eval(err);
}
网络层处理
网络层的入口函数是 ip_queue_xmit
。在 ip_queue_xmit
中,首先会检查 socket
中是否缓存了路由信息,如果没有的话,会通过路由表查找路由,并将其缓存在 socket
中。接着会为 skb
设置路由信息,然后构造 IP 头,设置 IP 头的一些信息,比如 TTL
,protocol
等。最后调用 ip_local_out
函数发送 skb
。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/ip_output.c#L326-L413
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
rt = skb_rtable(skb);
if (rt != NULL)
goto packet_routed;
/* Make sure we can route this packet. */
// 检查 socket 中是否缓存了路由信息
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (rt == NULL) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet->inet_daddr;
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
// 通过路由表查找路由, 并将其缓存在 socket 中
rt = ip_route_output_ports(sock_net(sk), fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS(sk),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
}
// 为 skb 设置路由信息
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb); // 设置 IP 头等信息
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb->h.foo itself. */
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
ip_select_ident_more(iph, &rt->dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
// 调用 ip_local_out 函数发送 skb
res = ip_local_out(skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
顺着 ip_local_out
可以看到下面的调用链为 ip_local_out
–> __ip_local_out
–> nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,skb_dst(skb)->dev, dst_output)
。这里看到数据是如何一步一步处理的,如果配置了 netfilter
的话,会调用 nf_hook
函数。这里就不探讨对应的细节了,直接看和发送相关的 dst_output
函数。
// include/net/dst.h
static inline int dst_output(struct sk_buff *skb)
{
return skb_dst(skb)->output(skb);
}
dst_output
函数找到 skb
的路由表,然后调用路由表的 output
函数。这里实际上又是一个函数指针,指向的是 ip_output
函数。 ip_output
中会获取目标设备,设置数据包的设备和协议,并更新 IP 输出的统计信息。最后调用 NF_HOOK_COND
函数,这个函数会调用 netfilter
的钩子函数,具体的 netfilter
的钩子函数这里同样就不展开了,直接看后面的 ip_finish_output
函数。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/ip_output.c#L298-L310
int ip_output(struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
在 ip_finish_output
中可以看到如果数据包的长度大于 mtu
并且不是 GSO
包的话,会调用 ip_fragment
函数进行分片,否则调用 ip_finish_output2
函数。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/ipv4/ip_output.c#L222-L235
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2); // 分片
else
return ip_finish_output2(skb);
}
ip_finish_output2
中会进行邻居项的查找设置,然后将数据送往下一层。
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
rcu_read_lock_bh();
// 根据下一跳地址查找邻居项,没有没有找到就创建一个
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop); // 没有找到就创建一个
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb); // 调用 dst_neigh_output 函数向下层传递
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb(skb);
return -EINVAL;
}
邻居子系统处理
邻居子系统位于网络层和数据链路层的中间,其作用是为网络层提供下层的封装,让网络层不必关心下层的地址信息。让下层决定发送到哪个 MAC 地址。在邻居子系统中主要查找或者创建邻居项,其中创建邻居项的时候可能会发出实际的 ARP 请求。然后封装 MAC 头,将发送过程再传递到下层的网络设备子系统。
在 ip_finish_output2
中调用了 __ipv4_neigh_lookup_noref(dev, nexthop)
函数,这个函数会查找邻居项,如果没有找到则调用 __neigh_create
创建一个。但是有了邻居项之后,仍然不具备发送 IP 数据包能力,还需要获取到 MAC 地址。这里调用 dst_neigh_output
将 skb
继续向下层传递。这个函数的最后的 output
实际指向的是 neigh_resolve_output
。
// include/net/dst.h
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
struct sk_buff *skb)
{
const struct hh_cache *hh;
if (dst->pending_confirm) {
unsigned long now = jiffies;
dst->pending_confirm = 0;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
else
return n->output(n, skb);
}
neigh_resolve_output
通过邻居条目解析目标 MAC 地址,然后生成并发送网络数据包。在 neigh_event_send
中可能会发出 ARP 请求。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/core/neighbour.c#L1284-L1320
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
int rc = 0;
if (!dst)
goto discard;
// 试图发送邻居事件(如 ARP 请求)
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
if (dev->header_ops->cache && !neigh->hh.hh_len)
neigh_hh_init(neigh, dst);
// 硬件头部的生成
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
rc = dev_queue_xmit(skb); // 调用 dev_queue_xmit 将数据包传递给网络设备子系统
else
goto out_kfree_skb;
}
out:
return rc;
discard:
neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh);
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
网络设备子系统处理
数据包经过邻居子系统,通过调用 dev_queue_xmit
到达网络设备子系统。在 dev_queue_xmit
中,首先会调用 netdev_pick_tx
选择发送队列, 这里需要选择队列是因为网卡是有多个发送队列的。然后获取此队列关联的 qdisc
,最后调用 __dev_xmit_skb
函数发送数据。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/core/dev.c#L2778
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
skb_reset_mac_header(skb);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
skb_update_prio(skb);
// 选择发送队列
txq = netdev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc); // 获取此队列相关的排队规则
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
// 没有队列的是回环设备和隧道设备
...
out:
rcu_read_unlock_bh();
return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
在 __dev_xmit_skb
中包含正常的排队发送逻辑:调用 q->enqueue
将 skb
添加到发送队列中,然后调用 __qdisc_run
开始发送数据。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/core/dev.c#L2654-L2715
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
bool contended;
int rc;
qdisc_pkt_len_init(skb);
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
kfree_skb(skb);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
} else
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
// 正常排队逻辑
// 将 skb 添加到发送队列中
skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
// 开始发送
__qdisc_run(q);
}
}
spin_unlock(root_lock);
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
}
在 __qdisc_run
中不断将 skb
中取出,并发送。
这里需要注意的是,这个时候实际占用的用户进程的系统态时间 (sy),只有当 quota 用尽或者其他进程需要 CPU 的时候才会触发软中断发送。这是为什么查看 /proc/softirqs
时,会发现 NET_TX
的数量要比 NET_RX
的少的原因。
// net/sched/sch_generic.c
void __qdisc_run(struct Qdisc *q)
{
int quota = weight_p;
while (qdisc_restart(q)) {
/*
* 发生下面的情况,延后处理
* 1. quota 用尽
* 2. 其他进程需要 CPU
*/
if (--quota <= 0 || need_resched()) {
__netif_schedule(q);
break;
}
}
qdisc_run_end(q);
}
// net/sched/sch_generic.c
static inline int qdisc_restart(struct Qdisc *q)
{
struct netdev_queue *txq;
struct net_device *dev;
spinlock_t *root_lock;
struct sk_buff *skb;
/* Dequeue packet */
skb = dequeue_skb(q);
if (unlikely(!skb))
return 0;
WARN_ON_ONCE(skb_dst_is_noref(skb));
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
return sch_direct_xmit(skb, q, dev, txq, root_lock);
}
// net/sched/sch_generic.c
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock)
{
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
ret = dev_hard_start_xmit(skb, dev, txq); // 调用驱动发送数据
HARD_TX_UNLOCK(dev, txq);
spin_lock(root_lock);
if (dev_xmit_complete(ret)) {
/* Driver sent out skb successfully or skb was consumed */
ret = qdisc_qlen(q);
} else if (ret == NETDEV_TX_LOCKED) {
/* Driver try lock failed */
ret = handle_dev_cpu_collision(skb, txq, q);
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely(ret != NETDEV_TX_BUSY))
net_warn_ratelimited("BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
}
if (ret && netif_xmit_frozen_or_stopped(txq))
ret = 0;
return ret;
}
软中断处理
在上一节提到在发送数据的时候,当 quota 用尽或者其他进程需要 CPU 的时候,会调用 __netif_schedule
触发软中断发送。这个函数到触发软中断的调用链为 __netif_schedule
–> __netif_schedule
–> __netif_reschedule
。在 __netif_reschedule
这个函数中能够获取到当前 CPU 的 softnet_data
,然后将 q
添加到 output_queue_tailp
中,最后调用 raise_softirq_irqoff(NET_TX_SOFTIRQ)
触发 NET_TX_SOFTIRQ
软中断。到这里以后发送数据消耗的 CPU 就显示在 si
上,不会消耗用户进程系统时间。
// net/core/dev.c
static inline void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
unsigned long flags;
local_irq_save(flags);
sd = &__get_cpu_var(softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
再来看下软中断处理函数 net_tx_action
是如何处理的。在软中断中会获取到 softnet_data
, 在内核态调用 __netif_reschedule
的时候将发送队列写到了 softnet_data
的 output_queue
中了,因此软中断遍历 sd->output_queue
发送数据帧。qdisc_run
中调用的也是 __qdisc_run
, 之后的发送逻辑就和之前的一样了。
// https://github.com/torvalds/linux/blob/8bb495e3f02401ee6f76d1b1d77f3ac9f079e376/net/core/dev.c#L3214-L3270
static void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock;
head = head->next_sched;
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q); // 发送数据帧
spin_unlock(root_lock);
} else {
if (!test_bit(__QDISC_STATE_DEACTIVATED,
&q->state)) {
__netif_reschedule(q);
} else {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
}
}
}
}
网卡驱动发送
无论是用户进程的内核态还是软中断的上下文,最后都会调用 dev_hard_start_xmit
函数,这个函数中会调用驱动中的发送函数。
// net/core/dev.c
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
const struct net_device_ops *ops = dev->netdev_ops;
int rc = NETDEV_TX_OK;
unsigned int skb_len;
if (likely(!skb->next)) {
netdev_features_t features;
/*
* If device doesn't need skb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
skb = __vlan_put_tag(skb, skb->vlan_proto,
vlan_tx_tag_get(skb));
if (unlikely(!skb))
goto out;
skb->vlan_tci = 0;
}
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
* features for the netdev
*/
if (skb->encapsulation)
features &= dev->hw_enc_features;
if (netif_needs_gso(skb, features)) {
if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb->next)
goto gso;
} else {
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (!(features & NETIF_F_ALL_CSUM) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
}
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
skb_len = skb->len;
rc = ops->ndo_start_xmit(skb, dev); // 调用驱动的 ops 里的发送回调函数讲数据包传给网卡设备
trace_net_dev_xmit(skb, rc, dev, skb_len);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
}
......
out_kfree_gso_skb:
if (likely(skb->next == NULL)) {
skb->destructor = DEV_GSO_CB(skb)->destructor;
consume_skb(skb);
return rc;
}
out_kfree_skb:
kfree_skb(skb);
out:
return rc;
}
驱动的发送函数是在网卡驱动初始化的时候就被赋值的,对于 igb 网卡设备,这个函数是 igb_xmit_frame
。在这个驱动函数中会将 skb 挂到 RingBuffer 上,驱动调用完毕,数据包真正的从网卡上发送出去。
RingBuffer 回收
当数据完成了发送之后,工作实际上并没有结束,因为此时内存还没有清理。因此当发送完成之后会触发一个硬中断来释放内存。在硬中断中会触发 NET_RX_SOFTIRQ
软中断。需要注意的点是这里触发的是接收的软中断 NET_RX_SOFTIRQ
,这也是为什么软中断统计 RX
要高于 TX
的原因。在软中断中会通过驱动的 poll
回调函数完成清理工作。
小结
本文从 send
系统调用开始分析了 Linux 网络数据包的发送原理,具体包括:send
系统调用的处理,各网络层的处理,网卡驱动设备在这个过程中是如何被使用的以及发送之后的清理逻辑。