Hello, network developers. I'm pleased to announce the first pre-alpha version of the zero-copy sniffer "device". It acts as a packet socket, i.e. it receives all packets through prot_hook.func(), but never copies them.
The basic idea behind zero-copy is remapping the physical pages where skb->data lives into the userspace process. According to my tests, which can be found commented out in the code (packet_mmap()), remapping one page is from 5 up to 20 times faster than copying the same amount of data (i.e. PAGE_SIZE). Since the current VM code requires the PTE to be unmapped when remapping, but only exports unmap_mapping_range() and __flush_tlb(), I used them, although they are quite heavy monsters. Remapping also requires mm->mmap_sem to be held, so I placed the main remapping code into a workqueue. skbs are queued in prot_hook.func() and then the work is scheduled; in the work handler each skb is unlinked and remapped. The skb is not freed there, as it normally should be, since userspace would then never find the real data; instead some smart algorithm should be investigated to defer skb freeing, or simple deferring using a timer and a redefined skb destructor. It should also remap several skbs at once, so that rescheduling does not happen very frequently. The first mapped page is an information page, where the in-page offset of skb->data is placed, so userspace can detect where the actual data lives on the next page. Such a scheme is very suitable for applications that do not require the whole data flow, but only select some data from the flow based on packet content. I'm quite sure it will be slower than copying for small packets, so these two ideas must be combined to achieve maximum sniffer performance. The current code is basically a proof of concept, so it has tons of dirty quirks, and I'm not a VM hacker, so I would gladly listen to your thoughts about the code and the idea itself. Attached files: af_tlb.[ch] - kernel side sniffer implementation. tlb_test.c - userspace "sniffer". Makefile - build kernel side with "all" target and userspace with "test" target. Thank you. -- Evgeniy Polyakov
# Kbuild object list: af_tlb.c is built as an external module.
obj-m := af_tlb.o

# Build tree of the running kernel. Override on the command line
# (make KDIR=/path/to/linux) to build against a different tree.
KDIR := /lib/modules/$(shell uname -r)/build
#KDIR := /usr/local/src/linux-2.6
PWD := $(shell pwd)

# Warning flags for the userspace test program.
UCFLAGS := -W -Wall

# None of these targets produce a file named after themselves.
.PHONY: default all test clean

# Kernel side: hand control to the kernel build system.
default:
	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules

# Alias so that "make all" builds the kernel side as well.
all: default

# Userspace side: the tlb_test "sniffer".
test:
	gcc $(UCFLAGS) tlb_test.c -o tlb_test

clean:
	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) clean
	@rm -f *~ tlb_test
/*
 * 	af_tlb.c
 *
 * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Zero-copy packet sniffer socket family (proof of concept).
 *
 * Received skbs are queued on the socket by the protocol hook; a work
 * item then remaps the page holding each skb's link-layer header into
 * the user mapping created by packet_mmap(). The first page of that
 * mapping is an information page (struct packet_shared).
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kmod.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>

#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/fs.h>
#include <linux/shm.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>

#include "af_tlb.h"

static void test_timer_func(void *data);

/*
 * Single, module-wide work item. packet_mmap() re-initialises it with
 * the mapping socket as its data, so only the most recently mapped
 * socket is served by the worker.
 */
static DECLARE_WORK(w, test_timer_func, NULL);

/* Convert a generic sock to the packet_sock that embeds it as first member. */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

/*
 * Socket destructor: complain if receive/send memory is still charged
 * or if the socket has not been marked dead yet.
 */
static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		printk("Attempt to release alive packet socket: %p\n", sk);
		return;
	}
}

static struct proto_ops packet_ops_spkt;

/*
 * Debug helper: print skb sharing state, packet type, length and the
 * Ethernet header. Only the first printk of each output line carries a
 * log level; the continuation calls must not, otherwise the level
 * marker is emitted in the middle of the line.
 */
static void dump_skb(struct sk_buff *skb)
{
	struct ethhdr *eth;
	int i;

	printk(KERN_INFO "shared=%d, cloned=%d, type=%d, len=%d.\n",
			skb_shared(skb), skb_cloned(skb), skb->pkt_type, skb->len);

	eth = eth_hdr(skb);

	printk(KERN_INFO "MAC: proto=%04x, src=", eth->h_proto);
	for (i=0; i<ETH_ALEN-1; ++i)
		printk("%02x:", eth->h_source[i]);
	printk("%02x, dst=", eth->h_source[ETH_ALEN-1]);
	for (i=0; i<ETH_ALEN-1; ++i)
		printk("%02x:", eth->h_dest[i]);
	printk("%02x.\n", eth->h_dest[ETH_ALEN-1]);
}

/*
 * Protocol hook: called for every packet matching po->prot_hook.
 * Queues the skb on the socket receive queue and, when the socket has
 * an active mapping, schedules the remapping work. Always returns 0.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;
	struct packet_sock *po;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	spkt = (struct sockaddr_pkt*)skb->cb;

	skb_push(skb, skb->data-skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0) {
		if (test_bit(PACKET_MAPPED, &po->flags))
			schedule_work(&w);
		return 0;
	}

out:
	kfree_skb(skb);
oom:
	return 0;
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	sk_del_node_init(sk);

	if (test_bit(PACKET_RUNNING, &po->flags)) {
		dev_remove_pack(&po->prot_hook);
		clear_bit(PACKET_RUNNING, &po->flags);
		__sock_put(sk);
	}

	sock_orphan(sk);
	sock->sk = NULL;

	skb_queue_purge(&sk->sk_receive_queue);

	/* Make sure the worker is neither pending nor running any more. */
	cancel_delayed_work(&w);
	flush_scheduled_work();

	/*
	 * The information page was marked reserved in packet_mmap();
	 * drop that mark before handing the page back, otherwise it is
	 * not released cleanly.
	 * NOTE(review): the exact symptom of freeing a reserved page
	 * depends on the kernel version - confirm against the target tree.
	 */
	if (po->page) {
		ClearPageReserved(virt_to_page(po->page));
		free_page(po->page);
		po->page = 0;
	}

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 *
 *	Removes a previously installed hook, records the new protocol and
 *	device and, unless protocol is 0, installs the hook again. A bound
 *	device that is down is reported through sk->sk_err (ENETDOWN).
 *	Always returns 0.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (test_bit(PACKET_RUNNING, &po->flags)) {
		__sock_put(sk);
		clear_bit(PACKET_RUNNING, &po->flags);
		po->num = 0;
		/* dev_remove_pack() is called with bind_lock dropped. */
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);
			set_bit(PACKET_RUNNING, &po->flags);
		} else {
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		set_bit(PACKET_RUNNING, &po->flags);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 * bind(2) handler: uaddr->sa_data carries the device name.
 *
 * Returns -EINVAL when addr_len is not sizeof(struct sockaddr),
 * -ENODEV when no such device exists, otherwise the result of
 * packet_do_bind().
 */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk=sock->sk;
	char name[sizeof(uaddr->sa_data) + 1];
	struct net_device *dev;
	int err = -ENODEV;

	/* Validate the length before looking at the address contents. */
	if(addr_len!=sizeof(struct sockaddr))
		return -EINVAL;

	/*
	 * sa_data is a fixed-size field that need not be NUL terminated,
	 * so copy exactly that many bytes and terminate explicitly rather
	 * than relying on a string function that scans for the terminator.
	 */
	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
	name[sizeof(uaddr->sa_data)] = 0;

	printk( "%s: name=%s.\n", __func__, name);

	dev = dev_get_by_name(name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

/* ioctl handler: every command is forwarded to dev_ioctl(). */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	switch(cmd) {
		default:
			return dev_ioctl(cmd, (void __user *)arg);
	}
	return 0;
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 * socket(2) handler for this family.
 *
 * Requires CAP_NET_RAW (-EPERM otherwise) and a SOCK_DGRAM, SOCK_RAW or
 * SOCK_PACKET type (-ESOCKTNOSUPPORT otherwise). Returns -ENOBUFS when
 * the sock cannot be allocated. With a non-zero protocol the receive
 * hook is installed right away.
 */
static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
	    && sock->type != SOCK_PACKET
	   )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops_spkt;
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = protocol;

	sk->sk_destruct = packet_sock_destruct;

	po->flags = 0;
	po->budget = 1;

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv_spkt;
	po->prot_hook.af_packet_priv = sk;

	if (protocol) {
		po->prot_hook.type = protocol;
		dev_add_pack(&po->prot_hook);
		/* The installed hook holds a reference on the socket. */
		sock_hold(sk);
		set_bit(PACKET_RUNNING, &po->flags);
	}

	printk( "%s: protocol=%d.\n", __func__, protocol);

	return 0;
out:
	return err;
}

/* vm_ops->open: only logs the socket behind the mapping. */
static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct socket * sock = SOCKET_I(inode);
	struct sock *sk = sock->sk;

	printk( "%s, sk=%p.\n", __func__, sk);
}

/*
 * vm_ops->close: drop the mappings of the file and detach the socket
 * from the mapping so that the worker stops remapping into it.
 */
static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct socket *sock = SOCKET_I(inode);
	struct sock *sk = sock->sk;

	printk( "%s, sk=%p.\n", __func__, sk);

	if (vma->vm_file)
		unmap_mapping_range(vma->vm_file->f_mapping, 0, 0, 1);

	if (sk) {
		struct packet_sock *po = pkt_sk(sk);

		if (po && po->tsk) {
			/*
			 * Clear the flag first: it is what the receive
			 * hook and the worker test before using tsk/vma.
			 */
			clear_bit(PACKET_MAPPED, &po->flags);
			po->tsk = NULL;
			/* Do not leave a pointer to a vma that is going away. */
			po->vma = NULL;
		}
	}
}

static struct vm_operations_struct packet_mmap_ops = {
	.open =	packet_mm_open,
	.close =packet_mm_close,
};

/*
 * Allocate one zeroed page for a page table (from highmem when
 * CONFIG_HIGHPTE is set). Returns NULL on failure.
 *
 * NOTE(review): this is a non-static definition of a name that the
 * architecture headers presumably declare as well; it is kept as is
 * because it is only needed by update_address() - confirm it does not
 * clash on the target architecture.
 */
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
	return pte;
}

/*
 * Experimental helper (its only call site is commented out): make sure
 * the page-table levels for @address exist in @mm and report, via
 * printk, whether a pte is already present there.
 *
 * Returns 0 on success, -1 for hugetlb areas or allocation failure.
 */
static int update_address(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;

	if (is_vm_hugetlb_page(vma)) {
		printk("Is it even possible here?.\n");
		return -1;
	}

	pgd = pgd_offset(mm, address);
	spin_lock(&mm->page_table_lock);

	pud = pud_alloc(mm, pgd, address);
	if (!pud)
		goto oom;

	pmd = pmd_alloc(mm, pud, address);
	if (!pmd)
		goto oom;

	if (!pmd_present(*pmd)) {
		struct page *new;

		printk("PMD for 0x%lx is not presented.\n", address);

		/* The allocation may sleep, so drop the spinlock around it. */
		spin_unlock(&mm->page_table_lock);
		new = pte_alloc_one(mm, address);
		spin_lock(&mm->page_table_lock);
		if (!new) {
			printk("PTE allocation for 0x%lx failed.\n", address);
			goto oom;
		}
		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (pmd_present(*pmd)) {
			printk("PMD for 0x%lx is presented.\n", address);
			pte_free(new);
			goto out;
		}
		mm->nr_ptes++;
		inc_page_state(nr_page_table_pages);
		pmd_populate(mm, pmd, new);
	}

	pte = pte_offset_map(pmd, address);
	if (!pte_none(*pte))
		printk("pte %p for 0x%lx exists.\n", pte, address);
#if 1
	/* The result is discarded, so this does not modify the pte. */
	pte_mkyoung(*pte);
	if (!pte_none(*pte))
		printk("pte %p for 0x%lx exists after pte_mkyoung().\n", pte, address);

	/*
	 * Take a copy before unmapping: the pte pointer must not be
	 * dereferenced once pte_unmap() has been called on it.
	 */
	entry = *pte;
	pte_unmap(pte);
	if (!pte_none(entry))
		printk("pte %p for 0x%lx exists after pte_unmap().\n", pte, address);
#else
	pte_unmap(pte);
#endif

out:
	spin_unlock(&mm->page_table_lock);
	printk("%s: exiting.\n", __func__);
	return 0;

oom:
	spin_unlock(&mm->page_table_lock);
	return -1;
}

/*
 * Work handler. @data is the socket passed to INIT_WORK() in
 * packet_mmap().
 *
 * Takes up to po->budget skbs from the receive queue and remaps the
 * page holding each skb's link-layer header into consecutive pages of
 * the user mapping, starting right after the information page. The
 * in-page offset of the header is published through the information
 * page; it is a single field, so it describes the most recently
 * remapped skb.
 */
static void test_timer_func(void *data)
{
	struct sock *sk = (struct sock *)data;
	struct packet_sock *po;
	struct packet_shared *ps;
	struct sk_buff *skb;
	struct mm_struct *mm;
	unsigned long virt, start, end;
	int num = 0;

	if (!sk)
		return;

	po = pkt_sk(sk);

	if (!po || !po->tsk || !po->vma ||
	    !test_bit(PACKET_RUNNING, &po->flags) ||
	    !test_bit(PACKET_MAPPED, &po->flags))
		return;

	/*
	 * Read the mm once and use the same pointer for lock and unlock;
	 * po->tsk can be cleared by packet_mm_close() in the meantime.
	 */
	mm = po->tsk->mm;
	if (!mm)
		return;

	down_write(&mm->mmap_sem);

	/*
	 * Re-check after the semaphore has been taken.
	 * NOTE(review): this assumes packet_mm_close() is normally called
	 * with mmap_sem held, so the mapping cannot go away below - confirm.
	 */
	if (!po->vma || !test_bit(PACKET_MAPPED, &po->flags))
		goto out_unlock;

	/* The first page of the mapping is the information page. */
	start = po->vma->vm_start + PAGE_SIZE;
	end = po->vma->vm_end;

	/*
	 * This actually should not be flush_tlb(),
	 * but it is only one call that can be used in modules.
	 * 		--zbr
	 */
	__flush_tlb();

	//update_address(po->vma->vm_mm, po->vma, __pa(virt));
	if (po->vma->vm_file) {
		unmap_mapping_range(po->vma->vm_file->f_mapping, PAGE_SIZE, 0, 0);
	}

	ps = (struct packet_shared *)po->page;

	/*
	 * Check budget and free window space BEFORE dequeueing, so that
	 * no skb is taken off the queue only to be dropped unprocessed.
	 */
	while (num < po->budget && start < end) {
		skb = skb_dequeue(&sk->sk_receive_queue);
		if (!skb)
			break;
		num++;

		virt = (unsigned long)skb->mac.raw;
		if (!virt) {
			/* Nothing was mapped, so nobody can look at this skb. */
			kfree_skb(skb);
			continue;
		}

		if (0) {
			int i;

			printk("offset=%lu, users=%d, dataref=%d.\n",
					offset_in_page(virt), atomic_read(&skb->users), atomic_read(&skb_shinfo(skb)->dataref));
			for (i=0; i<32; ++i)
				printk("%02x ", ((unsigned char *)virt)[i]);
			printk("\n");
		}

		ps->offset = offset_in_page(virt);

		//dump_skb(skb);

		SetPageReserved(virt_to_page(virt));
		if (remap_pfn_range(po->vma, start, __pa(virt) >> PAGE_SHIFT, PAGE_SIZE, po->vma->vm_page_prot)) {
			ClearPageReserved(virt_to_page(virt));
			/* Remapping failed: userspace never sees this skb. */
			kfree_skb(skb);
			continue;
		}

		start += PAGE_SIZE;

		/*
		 * Actually here should be some smart algo, which will defer skb freeing
		 * until userspace "read" it, so userspace should provide some kind of callback,
		 * which will require write permisions to the area, so it should be splitted.
		 * Or better just to free it after some timeout, say 100 msec should be enough.
		 *		--zbr
		 */
		//kfree_skb(skb);
	}

out_unlock:
	up_write(&mm->mmap_sem);
}

/*
 * mmap handler: set up the information page as the first page of the
 * mapping and arm the worker for this socket.
 *
 * Returns 0 on success, -EBUSY when the socket already has an active
 * mapping, -ENOMEM when the information page cannot be allocated and
 * -EIO when it cannot be mapped.
 */
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size = vma->vm_end - vma->vm_start;
	int err = 0;

	printk( "%s: size=0x%lx\n", __func__, size);

#if 0
	/* Disabled micro-benchmark: page remapping versus memcpy(). */
	{
		int i;
		struct timeval tv1, tv2;
		unsigned long start = vma->vm_start;
		u8 *data1, *data2;

		do_gettimeofday(&tv1);
		for (i=0; i<1000; i++) {
			if (remap_pfn_range(vma, start, __pa(PAGE_OFFSET) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
				break;
			__flush_tlb();
			if (vma->vm_file)
				unmap_mapping_range(vma->vm_file->f_mapping, 0, 0, 1);
			start += PAGE_SIZE;
		}
		do_gettimeofday(&tv2);

		printk("%s: 1000 remaps took %lu usec.\n",
				__func__, (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);

		data1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!data1)
			return -ENOMEM;
		data2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!data2) {
			/* data1 is the buffer that was allocated at this point. */
			kfree(data1);
			return -ENOMEM;
		}

		do_gettimeofday(&tv1);
		for (i=0; i<1000; i++) {
			memcpy(data1, ((void *)sock)+i*PAGE_SIZE, PAGE_SIZE);
		}
		do_gettimeofday(&tv2);

		printk("%s: 1000 copyings took %lu usec.\n",
				__func__, (tv2.tv_sec - tv1.tv_sec)*1000000 + tv2.tv_usec - tv1.tv_usec);

		kfree(data1);
		kfree(data2);
	}
#endif

	lock_sock(sk);

	/* Only one mapping per socket: po->vma/po->tsk hold a single one. */
	if (test_bit(PACKET_MAPPED, &po->flags)) {
		err = -EBUSY;
		goto err_out_unlock;
	}

	/*
	 * The information page lives until packet_release(); reuse it
	 * when the socket is mapped again instead of leaking the old one.
	 */
	if (!po->page) {
		po->page = __get_free_page(GFP_KERNEL);
		if (!po->page) {
			err = -ENOMEM;
			goto err_out_unlock;
		}
		SetPageReserved(virt_to_page(po->page));
	}
	memset((void *)po->page, 0, PAGE_SIZE);

	if (remap_pfn_range(vma, vma->vm_start, __pa(po->page) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) {
		/* The page stays in po->page and is freed by packet_release(). */
		err = -EIO;
		goto err_out_unlock;
	}

	/* Publish the mapping state only once everything has succeeded. */
	vma->vm_ops = &packet_mmap_ops;
	po->budget = size / PAGE_SIZE;
	po->tsk = current;
	po->vma = vma;

	release_sock(sk);

	/* Arm the worker before the receive hook is allowed to schedule it. */
	INIT_WORK(&w, test_timer_func, sk);
	set_bit(PACKET_MAPPED, &po->flags);

	return 0;

err_out_unlock:
	release_sock(sk);

	return err;
}

static struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	sock_no_getname,
	.poll =		sock_no_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	sock_no_sendmsg,
	.recvmsg =	sock_no_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

/* Module unload: unregister the socket family, then the proto. */
static void __exit packet_exit(void)
{
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

/*
 * Module load: register the proto and the PF_PACKET socket family.
 *
 * A failed family registration must fail the load; otherwise
 * packet_exit() would later unregister a PF_PACKET family that this
 * module never owned.
 */
static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	rc = sock_register(&packet_family_ops);
	if (rc != 0) {
		proto_unregister(&packet_proto);
		goto out;
	}

	printk("%s: initialized at %lu.\n", __func__, jiffies);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);
/*
 * 	af_tlb.h
 *
 * 2005 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef __AF_TLB_H
#define __AF_TLB_H

/* For __u16; keeps the header usable without a prior include. */
#include <linux/types.h>

/*
 * Data shared between kernel and userspace (both sides include this
 * header).
 */
struct packet_shared
{
	__u16			offset;		/* Offset of the packet data inside its page. */
};

#ifdef __KERNEL__

/* Bit numbers used with set_bit()/test_bit()/clear_bit() on packet_sock.flags. */
enum packet_flags {
	PACKET_RUNNING = 0,
	PACKET_MAPPED,
};

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
	struct packet_type	prot_hook;
	spinlock_t		bind_lock;
	/* Bitmap of enum packet_flags; the bit operations work on unsigned long. */
	unsigned long		flags;
	int			ifindex;
	unsigned short		num;

	struct vm_area_struct 	*vma;
	struct task_struct 	*tsk;
	int			budget;
	unsigned long		page;
};

#endif /* __KERNEL__ */
#endif /* __AF_TLB_H */
#include <sys/types.h> #include <sys/socket.h> #include <sys/mman.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <errno.h> #include <unistd.h> #include <netinet/in.h> #include <netinet/ip.h> #include <net/ethernet.h> #include <linux/if_ether.h> #include <linux/types.h> #include "af_tlb.h" #define PAGE_SIZE 4096 static size_t mmap_size = 2*PAGE_SIZE; #define ulog(f, a...) do { fprintf(stderr, f, ##a); fflush(stderr); } while (0) #define NIPQUAD(addr) \ ((unsigned char *)&addr)[0], \ ((unsigned char *)&addr)[1], \ ((unsigned char *)&addr)[2], \ ((unsigned char *)&addr)[3] static int dump_network(__u16 offset, void *ptr) { struct ether_header *eth = ptr; struct iphdr *ip; int i; unsigned short ether_type; ulog("offset=%x: ", offset); ether_type = ntohs(eth->ether_type); if (ether_type != ETH_P_IP && ether_type != ETH_P_ARP) { ulog("\n"); return -1; } ulog("MAC: proto=%04x, src=", eth->ether_type); for (i=0; i<ETH_ALEN-1; ++i) ulog("%02x:", eth->ether_shost[i]); ulog("%02x, dst=", eth->ether_shost[ETH_ALEN-1]); for (i=0; i<ETH_ALEN-1; ++i) ulog("%02x:", eth->ether_dhost[i]); ulog("%02x. 
", eth->ether_dhost[ETH_ALEN-1]); if (ether_type != ETH_P_IP) { ulog("\n"); return 0; } ip = (struct iphdr *)(ptr + sizeof(*eth)); ulog("%u.%u.%u.%u -> %u.%u.%u.%u.\n", NIPQUAD(ip->saddr), NIPQUAD(ip->daddr)); return 0; } static void dump_data(void *ptr, __u16 offset, int size) { int i; unsigned char *data = ptr + PAGE_SIZE + offset; ulog("%p: ", ptr); for (i=0; i<size; ++i) ulog("%02x ", data[i]); ulog("\n"); } int main(int argc, char *argv[]) { struct sockaddr sa; int s, err; socklen_t len = sizeof(sa); void *mmap_ptr; struct packet_shared *ps; if (argc > 1) memcpy(sa.sa_data, argv[1], sizeof(sa.sa_data)); else memcpy(sa.sa_data, "eth0", sizeof(sa.sa_data)); s = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); if (s == -1) { ulog("Failed to create PF_PACKET socket: %s [%d].\n", strerror(errno), errno); return -1; } mmap_ptr = mmap(NULL, mmap_size, PROT_READ, MAP_SHARED, s, 0); if (mmap_ptr == MAP_FAILED) { ulog("Failed to map socket %d: %s [%d].\n", s, strerror(errno), errno); err = -errno; goto err_out_close; } err = bind(s, &sa, len); if (err == -1) { ulog("Failed to bind socket %d to device %s: %s [%d].\n", s, sa.sa_data, strerror(errno), errno); goto err_out_unmap; } ps = (struct packet_shared *)mmap_ptr; while (1) { err = dump_network(ps->offset, mmap_ptr + PAGE_SIZE + ps->offset); if (err && ps->offset) dump_data(mmap_ptr, ps->offset, 32); } err = 0; err_out_unmap: munmap(mmap_ptr, mmap_size); err_out_close: close(s); return err; }