使用PACKET_MMAP和PACKET_TX_RING发送数据比“正常”方式（不使用它们）更慢

我正在用C编写一个流量生成器，使用PACKET_MMAP套接字选项创建一个环形缓冲区，以便通过原始套接字发送数据。先用待发送的以太网帧填充环形缓冲区，然后调用sendto。环形缓冲区的全部内容会通过套接字发送出去，这应该比在内存中保留一个缓冲区、并为其中每个需要发送的帧重复调用sendto具有更高的性能。

当不使用PACKET_MMAP时，在调用sendto时，将一个帧从用户地址存储器中的缓冲区复制到内核内存中的SK buf，然后内核必须将数据包复制到NIC访问的存储器中，向NIC发信号以将该帧DMA存入其自己的硬件缓冲区并将其排队以供传输。当使用PACKET_MMAP套接字选项时，映射的内存由应用程序分配并链接到原始套接字。应用程序将数据包放入mmap缓冲区，调用sendto，而不是内核必须将数据包复制到SK buf中，它可以直接从mmapped缓冲区读取它们。也可以从环形缓冲区中读取分组的“块”，而不是单独的分组/帧。因此，性能增加是一个系统调用，可以为每个帧复制多个帧和一个较少的复制操作，以将它们放入NIC硬件缓冲区。

当我使用PACKET_MMAP将套接字的性能与“普通”套接字（包含单个数据包的字符缓冲区）进行比较时，根本没有性能优势。为什么会这样？在Tx模式下使用PACKET_MMAP时，每个环块只能放入一个帧（而不像Rx模式那样每个环块可以放多个帧），但是我创建了256个块，所以一次sendto调用应该就能发送256帧，对吧？

使用PACKET_MMAP时的性能，main()调用packet_tx_mmap()：

[email protected]:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1 
Using inteface lo (1) 
Running in Tx mode 
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 17.65 (2206128128) pps 1457152 
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.08 (2385579520) pps 1575680 
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.28 (2409609728) pps 1591552 
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.31 (2414260736) pps 1594624 
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.30 (2411935232) pps 1593088

不使用PACKET_MMAP时的性能，main()调用packet_tx()：

[email protected]:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1 
Using inteface lo (1) 
Running in Tx mode 
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 18.44 (2305001412) pps 1522458 
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.30 (2537520018) pps 1676037 
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.29 (2535744096) pps 1674864 
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.26 (2533014354) pps 1673061 
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.32 (2539476106) pps 1677329

packet_tx()函数似乎比packet_tx_mmap()函数略快，但它的代码也略短一些，所以我认为这点微小的性能提升只是因为packet_tx()中的代码行数略少。所以在我看来，这两个函数的性能几乎相同，为什么？据我所知，PACKET_MMAP应该有更少的系统调用和复制，为什么它没有更快？

/*
 * Tx worker using a PACKET_TX_RING (TPACKET_V2).
 *
 * Each pass copies thd_opt->tx_buffer into every frame slot of the mmap()ed
 * ring, marks the slot TP_STATUS_SEND_REQUEST and then flushes the whole ring
 * with a single sendto() call.
 *
 * thd_opt_p: pointer to this thread's struct thd_opt.
 *
 * Loops forever; the process exits if the socket cannot be set up or if
 * sendto() fails. The trailing return only satisfies the signature.
 */
void *packet_tx_mmap(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;
    int32_t sock_fd = setup_socket_mmap(thd_opt_p);
    if (sock_fd == EXIT_FAILURE) exit(EXIT_FAILURE);

    struct tpacket2_hdr *hdr;
    uint8_t *frame;
    uint8_t *data;
    ssize_t send_ret = 0;
    // tp_frame_nr is a 32-bit count; a 16-bit index would wrap and never
    // terminate the loop for rings with more than 65535 frames.
    uint32_t i;

    while(1) {

        for (i = 0; i < thd_opt->tpacket_req.tp_frame_nr; i += 1) {

            // Byte offset of slot i, computed in size_t so large rings do not
            // overflow a 32-bit multiplication.
            frame = (uint8_t*)thd_opt->mmap_buf +
                    ((size_t)thd_opt->tpacket_req.tp_frame_size * i);
            hdr = (void*)frame;

            // The offset must be applied to a byte pointer: adding it to a
            // struct tpacket2_hdr* would scale it by sizeof(*hdr). On Tx the
            // kernel reads the payload at
            // frame base + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll)
            // (unless PACKET_TX_HAS_OFF is enabled on the socket).
            data = frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

            memcpy(data, thd_opt->tx_buffer, thd_opt->frame_size);
            hdr->tp_len = thd_opt->frame_size;
            hdr->tp_status = TP_STATUS_SEND_REQUEST;

        }

        // A zero-length sendto() asks the kernel to transmit every frame
        // currently marked TP_STATUS_SEND_REQUEST.
        send_ret = sendto(sock_fd, NULL, 0, 0, NULL, 0);
        if (send_ret == -1) {
            perror("sendto error");
            exit(EXIT_FAILURE);
        }

        thd_opt->tx_pkts += thd_opt->tpacket_req.tp_frame_nr;
        thd_opt->tx_bytes += send_ret;

    }

    return NULL;

}

注意，下面的函数调用setup_socket()而不是setup_socket_mmap()：

/*
 * Tx worker using a plain raw socket: sends thd_opt->tx_buffer with one
 * sendto() call per frame, forever.
 *
 * thd_opt_p: pointer to this thread's struct thd_opt.
 *
 * The process exits if the socket cannot be created or if sendto() fails
 * (same policy as packet_tx_mmap()). The trailing return only satisfies the
 * signature.
 */
void *packet_tx(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;

    int32_t sock_fd = setup_socket(thd_opt_p);

    if (sock_fd == EXIT_FAILURE) {
        printf("Can't create socket!\n");
        exit(EXIT_FAILURE);
    }

    ssize_t send_ret;

    while(1) {

        send_ret = sendto(sock_fd, thd_opt->tx_buffer,
                          thd_opt->frame_size, 0,
                          (struct sockaddr*)&thd_opt->bind_addr,
                          sizeof(thd_opt->bind_addr));

        // Do not fold a -1 error return into the byte counter or count a
        // failed send as a transmitted packet.
        if (send_ret == -1) {
            perror("sendto error");
            exit(EXIT_FAILURE);
        }

        thd_opt->tx_bytes += send_ret;
        thd_opt->tx_pkts += 1;

    }

    return NULL;

}

两个套接字设置函数的唯一区别粘贴如下，本质上就是建立PACKET_RX_RING或PACKET_TX_RING所需的设置：

// Set the TPACKET version, v2 for Tx and v3 for Rx 
// (v2 supports packet level send(), v3 supports block level read()) 
int32_t sock_pkt_ver = -1; 

if(thd_opt->sk_mode == SKT_TX) { 
    static const int32_t sock_ver = TPACKET_V2; 
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver)); 
} else { 
    static const int32_t sock_ver = TPACKET_V3; 
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver)); 
} 

if (sock_pkt_ver < 0) { 
    perror("Can't set socket packet version"); 
    return EXIT_FAILURE; 
} 


memset(&thd_opt->tpacket_req, 0, sizeof(struct tpacket_req)); 
memset(&thd_opt->tpacket_req3, 0, sizeof(struct tpacket_req3)); 

//thd_opt->block_sz = 4096; // These are set else where 
//thd_opt->block_nr = 256; 
//thd_opt->block_frame_sz = 4096; 

int32_t sock_mmap_ring = -1; 
if (thd_opt->sk_mode == SKT_TX) { 

    thd_opt->tpacket_req.tp_block_size = thd_opt->block_sz; 
    thd_opt->tpacket_req.tp_frame_size = thd_opt->block_sz; 
    thd_opt->tpacket_req.tp_block_nr = thd_opt->block_nr; 
    // Allocate per-frame blocks in Tx mode (TPACKET_V2) 
    thd_opt->tpacket_req.tp_frame_nr = thd_opt->block_nr; 

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET , PACKET_TX_RING , (void*)&thd_opt->tpacket_req , sizeof(struct tpacket_req)); 

} else { 

    thd_opt->tpacket_req3.tp_block_size = thd_opt->block_sz; 
    thd_opt->tpacket_req3.tp_frame_size = thd_opt->block_frame_sz; 
    thd_opt->tpacket_req3.tp_block_nr = thd_opt->block_nr; 
    thd_opt->tpacket_req3.tp_frame_nr = (thd_opt->block_sz * thd_opt->block_nr)/thd_opt->block_frame_sz; 
    thd_opt->tpacket_req3.tp_retire_blk_tov = 1; 
    thd_opt->tpacket_req3.tp_feature_req_word = 0; 

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET , PACKET_RX_RING , (void*)&thd_opt->tpacket_req3 , sizeof(thd_opt->tpacket_req3)); 
} 

if (sock_mmap_ring == -1) { 
    perror("Can't enable Tx/Rx ring for socket"); 
    return EXIT_FAILURE; 
} 


thd_opt->mmap_buf = NULL;
thd_opt->rd = NULL;

// Map the ring into user space. The mapping is identical for the Tx and the
// Rx ring; the length is computed in size_t so that large rings do not
// overflow a 32-bit multiplication.
thd_opt->mmap_buf = mmap(NULL, (size_t)thd_opt->block_sz * thd_opt->block_nr,
                         PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock_fd, 0);

if (thd_opt->mmap_buf == MAP_FAILED) {
    perror("mmap failed");
    return EXIT_FAILURE;
}

if (thd_opt->sk_mode != SKT_TX) {

    // Per block rings in Rx mode (TPACKET_V3): one iovec per ring block
    thd_opt->rd = calloc(thd_opt->tpacket_req3.tp_block_nr, sizeof(struct iovec));

    if (thd_opt->rd == NULL) {
        perror("Can't allocate Rx ring block vector");
        // Release the mapping created above before reporting the failure
        munmap(thd_opt->mmap_buf, (size_t)thd_opt->block_sz * thd_opt->block_nr);
        thd_opt->mmap_buf = NULL;
        return EXIT_FAILURE;
    }

    // tp_block_nr is a 32-bit count; use a 32-bit index so the loop cannot
    // wrap, and compute the byte offset in size_t.
    for (uint32_t i = 0; i < thd_opt->tpacket_req3.tp_block_nr; ++i) {
        thd_opt->rd[i].iov_base = thd_opt->mmap_buf + ((size_t)i * thd_opt->tpacket_req3.tp_block_size);
        thd_opt->rd[i].iov_len = thd_opt->tpacket_req3.tp_block_size;
    }

}

更新1：结果针对物理接口（一个或多个） 有人提到，使用PACKET_MMAP时可能没有看到性能差异的原因之一是我将流量发送到回送接口（其中，一方面，没有QDISC）。由于运行packet_tx_mmap()或packet_tx()例程中的任何一个都可以生成超过10Gbps的数据，并且我只有10Gbps接口可供我使用。我将两个接口绑定在一起，这些结果与上面显示的几乎相同，但两者之间的速度差异很小两个功能：

packet_tx()到20G bond0

1线程：平均10.77Gbps〜/ 889kfps〜
2个线程：平均19.19Gbps〜/ 1.58Mfps〜
3线程：平均19.67Gbps〜/ 1.62Mfps〜（这已是该bond所能达到的最快速度）

packet_tx_mmap()到20G bond0：

1线程：平均11.08Gbps〜/ 913kfps〜
2个线程：平均19.0Gbps〜/ 1.57Mfps〜
3线程：平均19.66Gbps〜/ 1.62Mfps〜（这已是该bond所能达到的最快速度）

以上结果对应的帧大小为1514字节（与上面的原始环回测试保持一致）。

在上述所有测试中，软IRQ的数量大致相同（使用this script测量）。在一个线程运行packet_tx()的情况下，该CPU内核每秒大约有40k个中断。使用2个和3个线程时，2个和3个内核上各自每秒约有40k个中断。使用packet_tx_mmap()时结果相同：单线程时一个CPU内核上大约40k个软IRQ，运行2个和3个线程时每个内核约40k个。

更新2：完整的源代码

我现在已经上传了完整的源代码。我还在编写这个应用程序，它可能有很多缺陷，但它们不在这个问题的范围之内：https://github.com/jwbensley/EtherateMT

来源

2017-04-03 jwbensley

你的网络有多快？你的尺寸有多大？你可能只是让你的链接饱和？你有没有检查实际（自动协商）比特率？ – maxy

帧大小为1514个八位字节（含头部），我将流量发送到回送接口lo，如输出中所示。我之所以把流量发送到回送接口，是为了排除网卡方面的问题。 – jwbensley

我的理解是，因为'packet_tx_mmap'函数应该与内核共享一个缓冲区，这意味着多个数据包在一个sendto（）'系统调用中从userland复制到kernelland，所以向loopback接口发送流量意味着我们正在测试并且不用担心将数据包DMA传给NIC，这对于packet_tx和packet_tx_mmap都是相同的过程，因为这是进一步在内核堆栈之下。 – jwbensley

Linux内核的许多接口没有很好的记录。或者即使它们看起来有很好的文档记录，它们也可能非常复杂，并且很难理解界面的功能性，甚至更难以操作的特性。由于这个原因，任何想要深入了解内核API或需要使用内核API创建高性能应用程序的人都需要能够与内核代码联系才能成功。

在这种情况下，提问者想要了解通过共享内存接口（数据包mmap）将内核原始帧发送到内核的性能特征。

Linux文档在here。其中指向“how to”的链接已经失效，该文档现在可以在here找到，并且包含packet_mmap.c的一份副本（我在here提供了一个略有不同的版本）。

的文档主要是面向对阅读，这是典型的用于使用数据包mmap的使用案例：从接口高效读取原始帧，例如：有效地从高速接口获得数据包捕获，几乎没有损失。

然而，OP对高性能编写感兴趣，这是一个非常不常见的用例，但对于似乎是OP想要处理的流量生成器/模拟器可能是有用的。谢天谢地，“如何做”就是写帧。

即便如此，关于如何实际工作的信息却很少，没有什么明显的帮助来回答OPs关于为什么使用数据包mmap似乎不会比不使用它快的问题，而是发送一个帧一次。

谢天谢地，内核源代码是开源的，编译良好，所以我们可以找到源代码来帮助我们得到问题的答案。

为了找到相关的内核代码，有几个关键字可以搜索，但PACKET_TX_RING脱颖而出作为此功能特有的套接字选项。在interwebs上搜索“PACKET_TX_RING linux交叉引用”会出现少量引用，其中包括af_packet.c，其中有一点检查似乎是执行所有AF_PACKET功能，包括数据包mmap。

翻阅af_packet.c，看起来用分组mmap传输的核心工作发生在tpacket_snd()。但这是正确的吗？我们如何判断这与我们的想法有什么关系？

一个非常强大的工具，用于从内核中获取这样的信息是SystemTap。（使用这需要你的内核安装调试符号。我碰巧使用Ubuntu和this是获得了SystemTap在Ubuntu上工作的秘诀。）

一旦SystemTap可以正常工作，就可以将SystemTap与packet_mmap.c配合使用：在内核函数tpacket_snd上安装探针，然后运行packet_mmap通过共享TX环发送一帧，以此确认tpacket_snd()是否真的被调用：

$ sudo stap -e 'probe kernel.function("tpacket_snd") { printf("W00T!\n"); }' & 
[1] 19961 
$ sudo ./packet_mmap -c 1 eth0 
[...] 
STARTING TEST: 
data offset = 32 bytes 
start fill() thread 
send 1 packets (+150 bytes) 
end of task fill() 
Loop until queue empty (0) 
END (number of error:0) 
W00T! 
W00T!

W00T！我们正在做某件事;实际上正在调用tpacket_snd。但我们的胜利将是短暂的。如果我们继续尝试从股票内核构建中获取更多信息，SystemTap会抱怨它找不到我们想要检查的变量，并且函数参数将以?或ERROR的值打印出来。这是因为内核是通过优化编译的，并且AF_PACKET的所有功能都在单个翻译单元af_packet.c中定义;许多函数都由编译器内联，有效地丢失了局部变量和参数。

为了从af_packet.c中撬出更多信息，我们将不得不构建内核版本，其中af_packet.c在没有优化的情况下构建。请参阅here了解一些指导。我会等。

好的，希望这不是太难，你已经成功启动了一个内核，SystemTap可以从中获得大量有用的信息。请记住，这个内核版本只是为了帮助我们弄清楚数据包mmap是如何工作的。我们无法从此内核获得任何直接的性能信息，因为af_packet.c是在没有优化的情况下构建的。如果事实证明我们需要获得优化版本的行为信息，那么我们可以用优化方式编译af_packet.c来构建另一个内核，但要添加一些插桩代码，通过不会被优化掉的变量来公开信息，以便SystemTap可以看到它们。

所以让我们用它来获取一些信息。看看status.stp：

# This is specific to net/packet/af_packet.c 3.13.0-116 

# Print the current wall-clock time as "[seconds.microseconds] " (no newline),
# used as a prefix for every trace line.
function print_ts() {
    now_us = gettimeofday_us();
    printf("[%10d.%06d] ", now_us / 1000000, now_us % 1000000);
}

# 325 static void __packet_set_status(struct packet_sock *po, void *frame, int status) 
# 326 { 
# 327 union tpacket_uhdr h; 
# 328 
# 329 h.raw = frame; 
# 330 switch (po->tp_version) { 
# 331 case TPACKET_V1: 
# 332  h.h1->tp_status = status; 
# 333  flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 
# 334  break; 
# 335 case TPACKET_V2: 
# 336  h.h2->tp_status = status; 
# 337  flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 
# 338  break; 
# 339 case TPACKET_V3: 
# 340 default: 
# 341  WARN(1, "TPACKET version not supported.\n"); 
# 342  BUG(); 
# 343 } 
# 344 
# 345 smp_wmb(); 
# 346 } 

# Trace status writes to TPACKET_V1 frames: fires at the `break` (line 334)
# that follows the store, printing the new status and the frame address.
probe kernel.statement("__packet_set_status@net/packet/af_packet.c:334") {
    print_ts();
    printf("SET(V1): %d (0x%.16x)\n", $status, $frame);
}

# Trace status writes to TPACKET_V2 frames: fires at the `break` (line 338).
probe kernel.statement("__packet_set_status@net/packet/af_packet.c:338") {
    print_ts();
    printf("SET(V2): %d\n", $status);
}

# 348 static int __packet_get_status(struct packet_sock *po, void *frame) 
# 349 { 
# 350 union tpacket_uhdr h; 
# 351 
# 352 smp_rmb(); 
# 353 
# 354 h.raw = frame; 
# 355 switch (po->tp_version) { 
# 356 case TPACKET_V1: 
# 357  flush_dcache_page(pgv_to_page(&h.h1->tp_status)); 
# 358  return h.h1->tp_status; 
# 359 case TPACKET_V2: 
# 360  flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 
# 361  return h.h2->tp_status; 
# 362 case TPACKET_V3: 
# 363 default: 
# 364  WARN(1, "TPACKET version not supported.\n"); 
# 365  BUG(); 
# 366  return 0; 
# 367 } 
# 368 } 

# Trace status reads of TPACKET_V1 frames: fires at the `return` (line 358),
# printing the status being returned and the frame address.
probe kernel.statement("__packet_get_status@net/packet/af_packet.c:358") {
    print_ts();
    printf("GET(V1): %d (0x%.16x)\n", $h->h1->tp_status, $frame);
}

# Trace status reads of TPACKET_V2 frames: fires at the `return` (line 361).
probe kernel.statement("__packet_get_status@net/packet/af_packet.c:361") {
    print_ts();
    printf("GET(V2): %d\n", $h->h2->tp_status);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 
# 2089 { 
# [...] 
# 2136 do { 
# 2137  ph = packet_current_frame(po, &po->tx_ring, 
# 2138    TP_STATUS_SEND_REQUEST); 
# 2139 
# 2140  if (unlikely(ph == NULL)) { 
# 2141   schedule(); 
# 2142   continue; 
# 2143  } 
# 2144 
# 2145  status = TP_STATUS_SEND_REQUEST; 
# 2146  hlen = LL_RESERVED_SPACE(dev); 
# 2147  tlen = dev->needed_tailroom; 
# 2148  skb = sock_alloc_send_skb(&po->sk, 
# 2149    hlen + tlen + sizeof(struct sockaddr_ll), 
# 2150    0, &err); 
# 2151 
# 2152  if (unlikely(skb == NULL)) 
# 2153   goto out_status; 
# 2154 
# 2155  tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 
# 2156     addr, hlen); 
# [...] 
# 2176  skb->destructor = tpacket_destruct_skb; 
# 2177  __packet_set_status(po, ph, TP_STATUS_SENDING); 
# 2178  atomic_inc(&po->tx_ring.pending); 
# 2179 
# 2180  status = TP_STATUS_SEND_REQUEST; 
# 2181  err = dev_queue_xmit(skb); 
# 2182  if (unlikely(err > 0)) { 
# [...] 
# 2195  } 
# 2196  packet_increment_head(&po->tx_ring); 
# 2197  len_sum += tp_len; 
# 2198 } while (likely((ph != NULL) || 
# 2199   ((!(msg->msg_flags & MSG_DONTWAIT)) && 
# 2200   (atomic_read(&po->tx_ring.pending)))) 
# 2201  ); 
# 2202 
# [...] 
# 2213 return err; 
# 2214 } 

# Entry to tpacket_snd(): dump the call parameters.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

# Line 2140: the frame returned by packet_current_frame() is tested for NULL.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2140") {
    print_ts();
    printf("tpacket_snd:2140: current frame ph = 0x%.16x\n", $ph);
}

# Line 2141: no frame was ready to send, about to call schedule().
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2141") {
    print_ts();
    printf("tpacket_snd:2141: (ph==NULL) --> schedule()\n");
}

# Line 2142: the `continue` after schedule(); show the values tested by the
# loop condition.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2142") {
    print_ts();
    printf("tpacket_snd:2142: flags 0x%x, pending %d\n",
           $msg->msg_flags, $po->tx_ring->pending->counter);
}

# Line 2197: a frame has been queued and is added to len_sum.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2197") {
    print_ts();
    printf("tpacket_snd:2197: flags 0x%x, pending %d\n",
           $msg->msg_flags, $po->tx_ring->pending->counter);
}

# Line 2213: return from tpacket_snd().
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}

# 1946 static void tpacket_destruct_skb(struct sk_buff *skb) 
# 1947 { 
# 1948 struct packet_sock *po = pkt_sk(skb->sk); 
# 1949 void *ph; 
# 1950 
# 1951 if (likely(po->tx_ring.pg_vec)) { 
# 1952  __u32 ts; 
# 1953 
# 1954  ph = skb_shinfo(skb)->destructor_arg; 
# 1955  BUG_ON(atomic_read(&po->tx_ring.pending) == 0); 
# 1956  atomic_dec(&po->tx_ring.pending); 
# 1957 
# 1958  ts = __packet_set_timestamp(po, ph, skb); 
# 1959  __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); 
# 1960 } 
# 1961 
# 1962 sock_wfree(skb); 
# 1963 } 

# Line 1959: the skb destructor is about to mark the tx ring frame as
# TP_STATUS_AVAILABLE; print the frame, its timestamp flags and the number of
# frames still pending.
probe kernel.statement("tpacket_destruct_skb@net/packet/af_packet.c:1959") {
    print_ts();
    printf("tpacket_destruct_skb:1959: ph = 0x%.16x, ts = 0x%x, pending %d\n",
           $ph, $ts, $po->tx_ring->pending->counter);
}

定义一个函数（print_ts打印出Unix纪元的时间与微秒级分辨率）和一些探头。

首先我们定义探针，在tx_ring中某个数据包的状态被设置或读取时打印出信息。接下来我们为tpacket_snd的调用和返回，以及do {...} while (...)循环中处理tx_ring数据包的若干位置定义探针。最后我们向skb析构函数添加一个探针。

我们可以用sudo stap status.stp启动SystemTap脚本。然后运行sudo packet_mmap -c 2 <interface>通过接口发送2帧。下面是我从SystemTap的脚本得到的输出：

[1492581245.839850] tpacket_snd: args(po=0xffff88016720ee38 msg=0x14) 
[1492581245.839865] GET(V1): 1 (0xffff880241202000) 
[1492581245.839873] tpacket_snd:2140: current frame ph = 0xffff880241202000 
[1492581245.839887] SET(V1): 2 (0xffff880241202000) 
[1492581245.839918] tpacket_snd:2197: flags 0x40, pending 1 
[1492581245.839923] GET(V1): 1 (0xffff88013499c000) 
[1492581245.839929] tpacket_snd:2140: current frame ph = 0xffff88013499c000 
[1492581245.839935] SET(V1): 2 (0xffff88013499c000) 
[1492581245.839946] tpacket_snd:2197: flags 0x40, pending 2 
[1492581245.839951] GET(V1): 0 (0xffff88013499e000) 
[1492581245.839957] tpacket_snd:2140: current frame ph = 0x0000000000000000 
[1492581245.839961] tpacket_snd:2141: (ph==NULL) --> schedule() 
[1492581245.839977] tpacket_snd:2142: flags 0x40, pending 2 
[1492581245.839984] tpacket_snd: return(300) 
[1492581245.840077] tpacket_snd: args(po=0x0 msg=0x14) 
[1492581245.840089] GET(V1): 0 (0xffff88013499e000) 
[1492581245.840098] tpacket_snd:2140: current frame ph = 0x0000000000000000 
[1492581245.840093] tpacket_destruct_skb:1959: ph = 0xffff880241202000, ts = 0x0, pending 1 
[1492581245.840102] tpacket_snd:2141: (ph==NULL) --> schedule() 
[1492581245.840104] SET(V1): 0 (0xffff880241202000) 
[1492581245.840112] tpacket_snd:2142: flags 0x40, pending 1 
[1492581245.840116] tpacket_destruct_skb:1959: ph = 0xffff88013499c000, ts = 0x0, pending 0 
[1492581245.840119] tpacket_snd: return(0) 
[1492581245.840123] SET(V1): 0 (0xffff88013499c000)

，这里是网络捕获：

有很多在SystemTap中输出的有用信息。我们可以看到tpacket_snd获得了环中第一帧的状态（TP_STATUS_SEND_REQUEST为1），然后将其设置为TP_STATUS_SENDING（2）。它与第二个一样。下一帧的状态为TP_STATUS_AVAILABLE（0），它不是发送请求，所以它调用schedule()来产生并继续循环。由于没有更多的帧要发送（ph==NULL）并且已经请求了非阻塞（msg->msg_flags ==MSG_DONTWAIT），所以do {...} while (...)循环终止，并且tpacket_snd返回300，排队传输的字节数。

接下来，packet_mmap再次调用sendto（通过“循环直到队列空”的代码），但tx环中已没有更多数据要发送，并且请求的是非阻塞，因此它立即返回0，因为没有数据被排队。请注意，它这次检查状态的帧与上一次调用中最后检查的帧是同一个——它不是从tx环的第一帧开始，而是检查head（用户空间无法获知该位置）。

异步调用析构函数，首先在第一帧上，将帧的状态设置为TP_STATUS_AVAILABLE，然后递减未决计数，然后在第二帧上调用。请注意，如果未请求非阻塞，则在do {...} while (...)循环结束时的测试将等到所有未完成的数据包在返回之前已传输到NIC（假设它支持分散的数据）。您可以通过运行packet_mmap并使用-t选项来观察此选项，该选项用于使用阻塞I/O的“线程”（直到它进入“循环直到队列为空”）。

有几件事要注意。首先，SystemTap输出中的时间戳并不是单调递增的：根据SystemTap输出来推断时间先后顺序并不安全。其次，请注意网络捕获（在本机完成）上的时间戳是不同的。FWIW，这个接口是一台廉价塔式电脑里的廉价1G网卡。

所以在这一点上，我想我们或多或少知道af_packet是如何处理共享tx环的。接下来的内容是tx环中的帧如何到达网络接口。查看linux网络内核中的控制流的overview的this section（处理层2传输的方式）可能会有帮助。 OK，所以如果你对2层传输的处理有一个基本的了解，看起来这个包mmap接口应该是一个巨大的消防水带;加载一个带有数据包的共享tx环，调用sendto()和MSG_DONTWAIT，然后tpacket_snd将遍历创建skb的tx队列并将它们排入qdisc。异步地，skb将从qdisc中出队并发送到硬件tx环。 skb应该是non-linear，所以他们会引用tx环中的数据而不是复制，而一个不错的现代NIC应该能够处理分散的数据并引用tx环中的数据。当然，这些假设中的任何一个都可能是错误的，所以我们试着用这种消防水带将大量的伤害转移到qdisc上。

但首先，关于qdisc如何工作的一个不常见的事实。它们拥有有限的数据量（通常以帧的数量计算，但在某些情况下，它可以以字节为单位），并且如果尝试将帧排入完整的qdisc，帧通常会被丢弃（取决于enqueuer决定这么做）。所以我会发出一个提示，我原来的假设是，OP使用数据包mmap将帧快速传输到qdisc，以至于很多被丢弃。但是不要太紧张，它会带你走向一个方向，但始终保持开放的态度。让我们试试看看会发生什么。

这样做的第一个问题是默认qdisc pfifo_fast不保留统计信息。所以让我们用qdisc pfifo替换它。默认pfifo将队列限制为TXQUEUELEN帧（通常默认为1000）。但是，因为我们想展示出压倒性一个队列规定，让我们明确地将其设置为50：

$ sudo tc qdisc add dev eth0 root pfifo limit 50 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8004: root refcnt 2 limit 50p 
Sent 42 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) 
backlog 0b 0p requeues 0

我们还测量需要多长时间来处理帧tpacket_snd与SystemTap的脚本call-return.stp：

# This is specific to net/packet/af_packet.c 3.13.0-116
#
# Logs a timestamped line on entry to and on return from tpacket_snd(), so
# the time spent inside one call can be read off the two timestamps.

# Print the current wall-clock time as "[seconds.microseconds] " (no newline).
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
# 2089 {
# [...]
# 2213 return err;
# 2214 }

# Entry to tpacket_snd(): dump the call parameters.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

# Line 2213: return from tpacket_snd().
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}

用sudo stap call-return.stp启动SystemTap脚本，然后向容量仅有50帧的qdisc中猛灌8096个1500字节的帧：

$ sudo ./packet_mmap -c 8096 -s 1500 eth0 
[...] 
STARTING TEST: 
data offset = 32 bytes 
start fill() thread 
send 8096 packets (+12144000 bytes) 
end of task fill() 
Loop until queue empty (0) 
END (number of error:0)

现在让我们检查有多少数据包被qdisc丢弃：

$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8004: root refcnt 2 limit 50p 
Sent 25755333 bytes 8606 pkt (dropped 1, overlimits 0 requeues 265) 
backlog 0b 0p requeues 265

WAT？丢弃了8096帧中的一帧，转储到50帧qdisc上？让我们来看看SystemTap输出：

[1492603552.938414] tpacket_snd: args(po=0xffff8801673ba338 msg=0x14) 
[1492603553.036601] tpacket_snd: return(12144000) 
[1492603553.036706] tpacket_snd: args(po=0x0 msg=0x14) 
[1492603553.036716] tpacket_snd: return(0)

WAT？在tpacket_snd中处理8096帧花了将近100ms？让我们来看看实际需要传输多长时间;在1千兆/秒〜= 97毫秒时，这是8096帧，1500字节/帧。 WAT？它闻起来有些阻塞。

让我们仔细看看tpacket_snd。 Groan：

skb = sock_alloc_send_skb(&po->sk, 
       hlen + tlen + sizeof(struct sockaddr_ll), 
       0, &err);

0看起来非常无害，但实际上这是noblock参数。它应该是msg->msg_flags & MSG_DONTWAIT（原来这是fixed in 4.1）。这里发生的事情是qdisc的大小不是唯一的限制性资源。如果为skb分配空间将超出套接字sndbuf限制的大小，则此调用将阻止等待skb被释放，或者将-EAGAIN返回给非阻塞调用方。在V4.1的修复中，如果请求非阻塞，它将返回写入的字节数，如果非零，否则-EAGAIN给调用者，这几乎看起来像某人不希望你弄清楚如何使用这个（例如你填写一个80MB数据的tx环，调用sendto与MSG_DONTWAIT，你会得到一个结果，你发送150KB而不是EWOULDBLOCK）。所以如果你运行4.1之前的内核（我相信OP运行> 4.1并且不受这个bug的影响），你需要打补丁af_packet.c并构建一个新的内核或者升级到内核4.1或者更好。

我现在已经启动了内核的修补版本，因为我使用的机器运行的是3.13。虽然sndbuf已满时我们不会再阻塞，但仍然会返回-EAGAIN。我对packet_mmap.c做了一些修改，以增大sndbuf的默认大小，并在必要时使用SO_SNDBUFFORCE覆盖系统对每个套接字的最大值限制（每个帧似乎需要大约750字节再加上帧大小）。我还对call-return.stp做了一些补充，以记录sndbuf最大大小（sk_sndbuf）、已使用量（sk_wmem_alloc）、sock_alloc_send_skb返回的任何错误，以及将skb排入qdisc时dev_queue_xmit返回的任何错误。这是新版本：

# This is specific to net/packet/af_packet.c 3.13.0-116 

# Print the current wall-clock time as "[seconds.microseconds] " (no newline),
# used as a prefix for every trace line.
function print_ts() {
    now_us = gettimeofday_us();
    printf("[%10d.%06d] ", now_us / 1000000, now_us % 1000000);
}

# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) 
# 2089 { 
# [...] 
# 2133 if (size_max > dev->mtu + reserve + VLAN_HLEN) 
# 2134  size_max = dev->mtu + reserve + VLAN_HLEN; 
# 2135 
# 2136 do { 
# [...] 
# 2148  skb = sock_alloc_send_skb(&po->sk, 
# 2149    hlen + tlen + sizeof(struct sockaddr_ll), 
# 2150    msg->msg_flags & MSG_DONTWAIT, &err); 
# 2151 
# 2152  if (unlikely(skb == NULL)) 
# 2153   goto out_status; 
# [...] 
# 2181  err = dev_queue_xmit(skb); 
# 2182  if (unlikely(err > 0)) { 
# 2183   err = net_xmit_errno(err); 
# 2184   if (err && __packet_get_status(po, ph) == 
# 2185     TP_STATUS_AVAILABLE) { 
# 2186    /* skb was destructed already */ 
# 2187    skb = NULL; 
# 2188    goto out_status; 
# 2189   } 
# 2190   /* 
# 2191   * skb was dropped but not destructed yet; 
# 2192   * let's treat it like congestion or err < 0 
# 2193   */ 
# 2194   err = 0; 
# 2195  } 
# 2196  packet_increment_head(&po->tx_ring); 
# 2197  len_sum += tp_len; 
# 2198 } while (likely((ph != NULL) || 
# 2199   ((!(msg->msg_flags & MSG_DONTWAIT)) && 
# 2200   (atomic_read(&po->tx_ring.pending)))) 
# 2201  ); 
# [...] 
# 2213 return err; 
# 2214 } 

# Entry to tpacket_snd(): dump the call parameters.
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

# Line 2133: just before the send loop; record the socket send buffer limit
# and how much of it is currently in use.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2133") {
    print_ts();
    printf("tpacket_snd:2133: sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

# Line 2153: sock_alloc_send_skb() returned NULL, about to goto out_status.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2153") {
    print_ts();
    printf("tpacket_snd:2153: sock_alloc_send_skb err = %d, sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

# Line 2182: result of dev_queue_xmit(); only report non-zero values.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2182") {
    if ($err != 0) {
        print_ts();
        printf("tpacket_snd:2182: dev_queue_xmit err = %d\n", $err);
    }
}

# Line 2187: xmit failed and the skb was already destructed.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2187") {
    print_ts();
    printf("tpacket_snd:2187: destructed: net_xmit_errno = %d\n", $err);
}

# Line 2194: skb was dropped but not destructed yet; err is about to be
# reset to 0.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2194") {
    print_ts();
    printf("tpacket_snd:2194: *NOT* destructed: net_xmit_errno = %d\n", $err);
}

# Line 2213: return from tpacket_snd(), with the send buffer state.
probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d) sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

让我们再试一次：

$ sudo tc qdisc add dev eth0 root pfifo limit 50 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8001: root refcnt 2 limit 50p 
Sent 2154 bytes 21 pkt (dropped 0, overlimits 0 requeues 0) 
backlog 0b 0p requeues 0 
$ sudo ./packet_mmap -c 200 -s 1500 eth0 
[...] 
c_sndbuf_sz:  1228800 
[...] 
STARTING TEST: 
data offset = 32 bytes 
send buff size = 1228800 
got buff size = 425984 
buff size smaller than desired, trying to force... 
got buff size = 2457600 
start fill() thread 
send: No buffer space available 
end of task fill() 
send: No buffer space available 
Loop until queue empty (-1) 
[repeated another 17 times] 
send 3 packets (+4500 bytes) 
Loop until queue empty (4500) 
Loop until queue empty (0) 
END (number of error:0) 
$ tc -s -d qdisc show dev eth0 
qdisc pfifo 8001: root refcnt 2 limit 50p 
Sent 452850 bytes 335 pkt (dropped 19, overlimits 0 requeues 3) 
backlog 0b 0p requeues 3

这里是SystemTap的输出：

[1492759330.907151] tpacket_snd: args(po=0xffff880393246c38 msg=0x14) 
[1492759330.907162] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 1 
[1492759330.907491] tpacket_snd:2182: dev_queue_xmit err = 1 
[1492759330.907494] tpacket_snd:2187: destructed: net_xmit_errno = -105 
[1492759330.907500] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 218639 
[1492759330.907646] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.907653] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[1492759330.907688] tpacket_snd:2182: dev_queue_xmit err = 1 
[1492759330.907691] tpacket_snd:2187: destructed: net_xmit_errno = -105 
[1492759330.907694] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[repeated 17 times] 
[1492759330.908541] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.908543] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337 
[1492759330.908554] tpacket_snd: return(4500) sk_sndbuf = 2457600 sk_wmem_alloc = 196099 
[1492759330.908570] tpacket_snd: args(po=0x0 msg=0x14) 
[1492759330.908572] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 196099 
[1492759330.908576] tpacket_snd: return(0) sk_sndbuf = 2457600 sk_wmem_alloc = 196099

现在事情正按预期;我们已经修复了一个导致我们阻止sndbuf限制的bug，并且我们已经调整了sndbuf限制，以便它不应该是一个约束，现在我们看到来自tx环的帧被排队到qdisc上，直到它满了，在这一点我们得到返回ENOBUFS。

接下来的问题是如何有效地保持发布到qdisc以保持界面繁忙。请注意，在我们填写qdisc并取回ENOBUFS的情况下，packet_poll的实现是无用的，因为它只是查询头是否为TP_STATUS_AVAILABLE，在这种情况下将保持TP_STATUS_SEND_REQUEST，直到后续调用sendto成功排队帧到qdisc。简单的权宜之计（在packet_mmap.c中更新）是在sendto上循环，直到成功或ENOBUFS或EAGAIN以外的错误。

无论如何，即使我们还没有一个能有效防止NIC挨饿的完整方案，我们现在了解的情况也已足以回答OP的问题。根据我们所了解的情况，当OP在阻塞模式下对tx环调用sendto时，tpacket_snd会开始把skb排入qdisc，直到超过sndbuf限制（默认值通常很小，约213K；此外我发现，共享tx环中被引用的帧数据也会计入这一限额），这时它就会阻塞（并且仍然持有pg_vec_lock）。随着skb被释放，更多的帧会被排入队列，sndbuf也许会再次被超出，于是我们再次阻塞。最终，所有数据都会被排入qdisc，但tpacket_snd会继续阻塞，直到所有帧都发送完毕为止（在NIC收到帧之前，不能把tx环中的该帧标记为可用，因为驱动环中的skb引用着tx环中的帧），而此时它仍然持有pg_vec_lock。此时NIC处于挨饿状态，其他任何向该套接字写入的线程也都被这把锁挡住。另一方面，当OP每次只发送一个数据包时，数据包由packet_snd处理：如果sndbuf中没有空间它会阻塞，随后把该帧排入qdisc并立即返回，不会等待帧真正被传输。随着qdisc被排空，其他帧可以继续入队。只要发送方跟得上，NIC就永远不会挨饿。

此外，OP在每次sendto调用前都要把帧复制到tx环中，却拿它与不使用tx环时直接传递固定帧缓冲区的情况作比较。这样做是看不到省去复制所带来的加速的（尽管这并不是使用tx环的唯一好处）。

来源

2017-04-15 14:57:24

感谢您的所有帮助，我现在正在使用调试符号和未优化版本的'af_packet.c'构建一个内核。当我们等待时，只是一些食物而已。我没有在我的应用程序中使用'MSG_DONTWAIT'标志。我没有试图使用非阻塞调用，所以使用'sock_alloc_send_skb（x，x，0，x）'得到了上面的代码片段 - 即使我们发现第三个参数没有被正确传递的错误，无论如何都应该是零？ – jwbensley

另外我有一个4.4.x的内核，但只要你有> = 3.14，如果你看看我已经放在Github上的代码，有一个套接字选项来绕过QDISC层，并跨越传输。一旦我构建了我的调试内核，我将为'packet_mmap.c'测试程序添加相同的套接字选项以查看其效果：'int bypass = 1;''int ret = setsockopt（sock_fd，SOL_PACKET，PACKET_QDISC_BYPASS，＆bypass，sizeof （旁路））;' – jwbensley

我的内核仍在编译，但从快速扫描'af_packet.c'我期待在L2695'err = po-> xmit（skb）;'指向'packet_direct_xmit（）'，而不是'dev_queue_xmit（）'，参见'af_packet.c'中的L3751。我在想[this]（http://lxr.free-electrons.com/source/net/packet/af_packet.c#L3751）指向[this]（http://lxr.free-electrons.com/ source/net/packet/af_packet.c＃L250），它指向[this]（http://lxr.free-electrons.com/source/include/linux/netdevice.h#L3970），它指向[this]（ http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/igb/igb_main.c#L2143）。 – jwbensley

使用PACKET_MMAP和PACKET_TX_RING发送数据比“正常”慢（无）

回答

相关问题