On Sat, May 2, 2020 at 4:29 AM Lese Doru Calin <lesedorucali...@gmail.com> wrote: > > In this year's edition of GSoC, there is a project idea for CRIU to add > support > for checkpoint/restore of cork-ed UDP sockets. But to add it, the kernel API > needs > to be extended. > This is what this patch does. It adds a new command, called SIOUDPPENDGET, to > the > ioctl syscall regarding UDP sockets, which stores the pending data from the > write > queue and the destination address in a struct msghdr. The arg for ioctl needs > to > be a pointer to a user space struct msghdr. The syscall returns the number of > writed > bytes, if successful, or error. To retrive the data requires the CAP_NET_ADMIN > capability. > > Signed-off-by: Lese Doru Calin <lesedorucali...@gmail.com>
A few concerns: - why call the BPF recvmsg hook from this ioctl function? - The patch saves msg_name, but not other relevant state stored in inet_cork, such as state passed through cmsg. Without that, this might introduce more subtle bugs than not checkpointing at all. - Duplicating usercopy code from net/socket.c is fragile and adds maintenance burden. If anything such refactoring should stay inside that file. To some extent the same applies to udp_peek_sndq and net/core/datagram.c. Less important - a getsockopt is generally preferred over extending ioctl. Overall, I'm not sure that this is the right approach. It is quite a bit of code, for a mostly hypothetical omission to CRIU? Since these are unreliable datagrams, it is arguably sufficient to just drop a datagram if checkpoint/restore happened on a corked socket. That might be simpler. As long as CRIU is behind a static branch and thus adds no cycles in the common hot path, I personally don't mind having it in the normal send/recv path as much -- if that allows it to reuse existing code, e.g., for copy to user. The MSG_ERRQUEUE path in particular is already a slow path to loop packets from the send patch back to the sending process. > --- > include/linux/socket.h | 2 + > include/uapi/linux/sockios.h | 3 + > net/ipv4/udp.c | 145 +++++++++++++++++++++++++++++++---- > net/socket.c | 4 +- > 4 files changed, 139 insertions(+), 15 deletions(-) > > diff --git a/include/linux/socket.h b/include/linux/socket.h > index 54338fac45cb..632ba0ea6709 100644 > --- a/include/linux/socket.h > +++ b/include/linux/socket.h > @@ -351,6 +351,8 @@ struct ucred { > #define IPX_TYPE 1 > > extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct > sockaddr_storage *kaddr); > +extern int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, > + void __user *uaddr, int __user *ulen); > extern int put_cmsg(struct msghdr*, int level, int type, int len, void > *data); > > struct timespec64; > diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h > index 7d1bccbbef78..3639fa906604 100644 > --- a/include/uapi/linux/sockios.h > +++ b/include/uapi/linux/sockios.h > @@ -153,6 +153,9 @@ > #define SIOCSHWTSTAMP 0x89b0 /* set and get config */ > #define SIOCGHWTSTAMP 0x89b1 /* get config */ > > +/* UDP socket calls*/ > +#define SIOUDPPENDGET 0x89C0 /* get the pending data from write queue */ > + > /* Device private ioctl calls */ > > /* > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index 32564b350823..f729a5e7f90b 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -1620,6 +1620,133 @@ static int first_packet_length(struct sock *sk) > return res; > } > > +static void udp_set_source_addr(struct sock *sk, struct msghdr *msg, > + int *addr_len, u32 addr, u16 port) > +{ > + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); > + > + if (sin) { > + sin->sin_family = AF_INET; > + sin->sin_port = port; > + sin->sin_addr.s_addr = addr; > + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); > + *addr_len = sizeof(*sin); > + > + if (cgroup_bpf_enabled) > + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, > + (struct sockaddr *)sin); > + } > +} > + > +static int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int > len) > +{ > + int copy, copied = 0, err = 0; > + struct sk_buff *skb; > + > + skb_queue_walk(&sk->sk_write_queue, skb) { > + copy = len - copied; > + if (copy > skb->len - off) > + copy = skb->len - off; > + > + err = skb_copy_datagram_msg(skb, off, msg, copy); > + if (err) > + break; > + > + copied += copy; > + if (len <= copied) > + break; > + } > + return err ?: copied; > +} > + > +static int udp_get_pending_write_queue(struct sock *sk, struct msghdr *msg, > + int *addr_len) > +{ > + int err = 0, off = sizeof(struct udphdr); > + struct inet_sock *inet = inet_sk(sk); > + struct udp_sock *up = udp_sk(sk); > + struct flowi4 *fl4; > + struct flowi6 *fl6; > + > + switch (up->pending) { > + case 0: > + return -ENODATA; > + case AF_INET: > + off += sizeof(struct iphdr); > + fl4 = &inet->cork.fl.u.ip4; > + udp_set_source_addr(sk, msg, addr_len, > + fl4->daddr, fl4->fl4_dport); > + break; > + case AF_INET6: > + off += sizeof(struct ipv6hdr); > + if (msg->msg_name) { > + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, > + msg->msg_name); > + > + fl6 = &inet->cork.fl.u.ip6; > + sin6->sin6_family = AF_INET6; > + sin6->sin6_port = fl6->fl6_dport; > + sin6->sin6_flowinfo = 0; > + sin6->sin6_addr = fl6->daddr; > + sin6->sin6_scope_id = fl6->flowi6_oif; > + *addr_len = sizeof(*sin6); > + > + if (cgroup_bpf_enabled) > + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, > + (struct sockaddr *)sin6); > + } > + break; > + default: > + return -EINVAL; > + } > + > + lock_sock(sk); > + if (unlikely(!up->pending)) { > + release_sock(sk); > + return -EINVAL; > + } > + err = udp_peek_sndq(sk, msg, off, msg_data_left(msg)); > + release_sock(sk); > + return err; > +} > + > +static int prep_msghdr_recv_pending(struct sock *sk, void __user *argp) > +{ > + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; > + struct user_msghdr __user *msg; > + struct sockaddr __user *uaddr; > + struct sockaddr_storage addr; > + struct msghdr msg_sys; > + int __user *uaddr_len; > + int err = 0, len = 0; > + > + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) > + return -EPERM; > + > + if (!argp) > + return -EINVAL; > + > + msg = (struct user_msghdr __user *)argp; > + err = recvmsg_copy_msghdr(&msg_sys, msg, 0, &uaddr, &iov); > + if (err < 0) > + return err; > + > + uaddr_len = &msg->msg_namelen; > + msg_sys.msg_name = &addr; > + msg_sys.msg_flags = 0; > + > + err = udp_get_pending_write_queue(sk, &msg_sys, &len); > + msg_sys.msg_namelen = len; > + len = err; > + > + if (uaddr && err >= 0) > + err = move_addr_to_user(&addr, msg_sys.msg_namelen, > + uaddr, uaddr_len); > + > + kfree(iov); > + return err < 0 ? err : len; > +} > + > /* > * IOCTL requests applicable to the UDP protocol > */ > @@ -1641,6 +1768,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long > arg) > return put_user(amount, (int __user *)arg); > } > > + case SIOUDPPENDGET: > + return prep_msghdr_recv_pending(sk, (void __user *)arg); > + > default: > return -ENOIOCTLCMD; > } > @@ -1729,7 +1859,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, > size_t len, int noblock, > int flags, int *addr_len) > { > struct inet_sock *inet = inet_sk(sk); > - DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); > struct sk_buff *skb; > unsigned int ulen, copied; > int off, err, peeking = flags & MSG_PEEK; > @@ -1794,18 +1923,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, > size_t len, int noblock, > > sock_recv_ts_and_drops(msg, sk, skb); > > - /* Copy the address. */ > - if (sin) { > - sin->sin_family = AF_INET; > - sin->sin_port = udp_hdr(skb)->source; > - sin->sin_addr.s_addr = ip_hdr(skb)->saddr; > - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); > - *addr_len = sizeof(*sin); > - > - if (cgroup_bpf_enabled) > - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, > - (struct sockaddr > *)sin); > - } > + udp_set_source_addr(sk, msg, addr_len, ip_hdr(skb)->saddr, > + udp_hdr(skb)->source); > > if (udp_sk(sk)->gro_enabled) > udp_cmsg_recv(msg, sk, skb); > diff --git a/net/socket.c b/net/socket.c > index 2dd739fba866..bd25d528c9a0 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -217,8 +217,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, > struct sockaddr_storage *k > * specified. Zero is returned for a success. > */ > > -static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, > - void __user *uaddr, int __user *ulen) > +int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, > + void __user *uaddr, int __user *ulen) > { > int err; > int len; > -- > 2.17.1 >