I've spent the last week or so trying to track down a recurring problem I'm seeing with UDP datagram handling. I'm new to the internals of the Linux network stack, but it appears to me that there's a substantial error in recent kernels' handling of UDP checksum errors.
The behavior I'm seeing: I have a UDP server that receives lots of datagrams from many devices on a single port. A small number of those devices occasionally send packets with bad UDP checksums. After I receive one of these bad packets, the next recvmsg() made on the socket returns data from the bad-checksum packet, but the source-address and length of the next (good) packet that arrived at the port. I believe this issue was introduced between linux 3.18 and 3.19, by a set of changes to net/core/datagram.c that made skb_copy_and_csum_datagram_msg() and related functions use the iov_iter structure to copy data to user buffers. In the case where those functions copy a datagram and then conclude that the checksum is invalid, they don't remove the already-copied data from the user's iovec; when udp_recvmsg() calls skb_copy_and_csum_datagram_msg() for a second time, looking at the next datagram on the queue, that second datagram's data is appended to the first datagram's. So when recvmsg() returns to the user, the return value and msg_name reflect the second datagram, but the first bytes in the user's iovec come from the first (bad) datagram. (I've attached a test program that demonstrates the problem. Note that it sees correct behavior unless the bad-checksum packet has > 68 bytes of UDP data; otherwise, the packet doesn't make it past the CHECKSUM_BREAK test, and never enters skp_copy_and_csum_datagram_msg().) The fix for UDP seems pretty simple; the iov_iter's iov_offset member just needs to be set back to zero on a checksum failure. But it looks like skb_copy_and_csum_datagram_msg() is also called from tcp_input.c, where I assume that multiple sk_buffs can be copied-and-csum'd into the same iov -- if that's right, it seems like iov_iter needs some additional state to support rolling-back the most recent copy without losing previous ones. Any thoughts?
#include <string.h> #include <unistd.h> #include <netdb.h> #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <sys/socket.h> #include <netinet/in.h> #include <sys/uio.h> #include <arpa/inet.h> #include <netinet/udp.h> int main(int argc, char **argv) { int ret; if (argc < 2) { fprintf(stderr, "Usage: csumtest <bytes-in-bad-packet>\n"); exit(1); } struct sockaddr_in addr; socklen_t addrLen = sizeof(addr); addr.sin_family = AF_INET; addr.sin_addr.s_addr = inet_addr("127.0.0.1"); addr.sin_port = htons(0); int listen = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); if (listen < 0) { fprintf(stderr, "failed to create listening socket (err=%d: %s)\n", errno, strerror(errno)); exit(1); } ret = bind(listen, (struct sockaddr *) &addr, addrLen); if (ret != 0) { fprintf(stderr, "failed to bind listening socket socket (err=%d: %s)\n", errno, strerror(errno)); exit(1); } ret = getsockname(listen, (struct sockaddr *) &addr, &addrLen); if (ret != 0) { fprintf(stderr, "getsockname failed for listening socket socket (err=%d: %s)\n", errno, strerror(errno)); exit(1); } else { printf("listening on port %d\n", ntohs(addr.sin_port)); } // Send a packet with deliberately bad checksum char rawBuf[4096]; memset(rawBuf, 0, 4096); struct udphdr *udph = (struct udphdr *) rawBuf; // Send as UDP data the string "BAD DATA", repeated as necessary // to fill the number of bytes requested on the command-line char *data = rawBuf + sizeof(struct udphdr); int badBytes = atoi(argv[1]); int i; for (i = 0; i < badBytes; i += 8) { strncpy(data + i, "BAD DATA", 8); } // UDP headers contain a bogus checksum, but are otherwise reasonable udph->check = htons(0xbadd); udph->source = htons(18); udph->dest = addr.sin_port; udph->len = htons(sizeof(struct udphdr) + badBytes); int raw = socket(AF_INET, SOCK_RAW, IPPROTO_UDP); ret = sendto(raw, rawBuf, sizeof(struct udphdr) + badBytes, 0, (struct sockaddr *) &addr, addrLen); if (ret < 0) { fprintf(stderr, "raw send failed (err=%d: %s)\n", errno, strerror(errno)); exit(1); } // Send a second, legitimate UDP packet int cooked = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); char *second = "Good data"; ret = sendto(cooked, second, strlen(second), 0, (struct sockaddr *) &addr, addrLen); if (ret < 0) { fprintf(stderr, "second cooked send failed (err=%d: %s)\n", errno, strerror(errno)); exit(1); } // Read what's queued up for us. while (1) { struct msghdr msg; struct iovec iov; char readBuf[4096]; ssize_t bytes; memset(readBuf, 0, 4096); iov.iov_base = readBuf; iov.iov_len = sizeof(readBuf); msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; bytes = recvmsg(listen, &msg, 0); if (bytes < 0) { fprintf(stderr, "getsockname failed for listening socket socket (err=%d: %s)\n", errno, strerror(errno)); exit(1); } printf("recvmsg returned %ld bytes: %s\n", bytes, readBuf); if (bytes == 9) break; } }