On Fri, Oct 20, 2006 at 01:53:05PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) wrote: > Netchannel [1] is a pure bridge between low-level hardware and the user, without any > special protocol processing involved between them. > Users are not limited to userspace only - I will use this netchannel > infrastructure for a fast NAT implementation, which is a purely kernelspace user > (although it is possible to create NAT in userspace, the price of the > kernelspace boundary crossing is too high for something which only needs to change some > fields in the header and recalculate the checksum). > Userspace network stack [2] is another user of the new netchannel subsystem. > > Current netchannel version supports data transfer using copy*user().
Performance graph (speed and CPU usage) attached. Benchmark uses 128 bytes sending/receiving per syscall (no latency checks, only throughput). MB and KB mean multiples of 1024, not 1000. Receiving is about 8 MB/sec faster. Receiving CPU usage is 3 times less (90% socket code vs. 30% netchannels+unetstack). Sending is 10 MB/sec faster. Sending CPU usage is 5 times less (up to 50% vs. up to 10%). Number of syscalls is about 10 times less for netchannels. Hardware. System 1. Netchannel kernel (2.6.19-rc3-git) or vanilla 2.6.19-rc3/2.6.18-1.2200.fc5. amd64 athlon 3500+ cpu 1gb ram r8169 nic System 2. 2.6.17-2-686 debian etch intel core duo 3.40GHz 2 gb ram Marvell Technology Group Ltd. 88E8053 PCI-E Gigabit Ethernet Controller (sky2 driven) All software used in tests (tcp_client.c/tcp_test.c and userspace network stack) can be found on the project's homepages (the userspace network stack requires a larger window scaling factor than the default). Please consider the netchannel subsystem for inclusion. 1. Netchannels homepage. http://tservice.net.ru/~s0mbre/old/?section=projects&item=netchannel 2. Userspace network stack homepage. 
http://tservice.net.ru/~s0mbre/old/?section=projects&item=unetstack Signed-off-by: Evgeniy Polyakov <[EMAIL PROTECTED]> diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 2697e92..3231b22 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -319,3 +319,4 @@ ENTRY(sys_call_table) .long sys_move_pages .long sys_getcpu .long sys_epoll_pwait + .long sys_netchannel_control diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index b4aa875..d35d4d8 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -718,4 +718,5 @@ #endif .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_netchannel_control ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index beeeaf6..33242f8 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -325,10 +325,11 @@ #define __NR_vmsplice 316 #define __NR_move_pages 317 #define __NR_getcpu 318 #define __NR_epoll_pwait 319 +#define __NR_netchannel_control 320 #ifdef __KERNEL__ -#define NR_syscalls 320 +#define NR_syscalls 321 #include <linux/err.h> /* diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 777288e..16f1aac 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,8 +619,10 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_netchannel_control 280 +__SYSCALL(__NR_netchannel_control, sys_netchannel_control) -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_netchannel_control #ifdef __KERNEL__ #include <linux/err.h> diff --git a/include/linux/netchannel.h b/include/linux/netchannel.h new file mode 100644 index 0000000..23e9f1e --- /dev/null +++ b/include/linux/netchannel.h @@ -0,0 +1,88 @@ +/* + * netchannel.h + * + * 2006 Copyright (c) Evgeniy Polyakov <[EMAIL 
PROTECTED]> + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __NETCHANNEL_H +#define __NETCHANNEL_H + +#include <linux/types.h> + +enum netchannel_commands { + NETCHANNEL_CREATE = 0, + NETCHANNEL_RECV, + NETCHANNEL_SEND, +}; + +enum netchannel_type { + NETCHANNEL_COPY_USER = 0, + NETCHANNEL_NTA, +}; + +struct unetchannel +{ + __u32 faddr, laddr; /* foreign/local hashes */ + __u16 fport, lport; /* foreign/local ports */ + __u8 proto; /* IP protocol number */ + __u8 copy:3, /* Netchannel type: copy_to_user, mmap or something */ + state:5; /* Some initial state */ + __u8 memory_limit_order; /* Memor limit order */ + __u8 init_stat_work; /* Start statistic dumping */ +}; + +struct unetchannel_control +{ + struct unetchannel unc; + __u32 cmd; + __u16 len, header_len; + __u32 flags; + __u32 timeout; + int fd; +}; + +#ifdef __KERNEL__ + +struct netchannel +{ + struct rb_node netchannel_node; + atomic_t refcnt; + struct rcu_head rcu_head; + struct unetchannel unc; + unsigned long hit; + + struct page * (*nc_alloc_page)(unsigned int size); + void (*nc_free_page)(struct page *page); + int (*nc_recv_data)(struct netchannel *, unsigned int *timeout, __u16 *len, void __user *arg); + int (*nc_send_data)(struct netchannel *, unsigned int *timeout, __u16 len, __u16 
header_len, void __user *arg); + + struct sk_buff_head recv_queue; + wait_queue_head_t wait; + + unsigned long qlen; + + struct work_struct work; + + struct dst_entry *dst; +}; + +#define NETCHANNEL_MAX_ORDER 31 +#define NETCHANNEL_MIN_ORDER PAGE_SHIFT + +#endif /* __KERNEL__ */ +#endif /* __NETCHANNEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9264139..5b1c042 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -707,6 +707,15 @@ extern int dev_hard_start_xmit(struct s extern void dev_init(void); +#ifdef CONFIG_NETCHANNEL +extern int netchannel_recv(struct sk_buff *skb); +#else +static int netchannel_recv(struct sk_buff *skb) +{ + return -1; +} +#endif + extern int netdev_budget; /* Called by rtnetlink.c:rtnl_unlock() */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85577a4..ff2bdf9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -338,6 +338,18 @@ static inline struct sk_buff *alloc_skb( return __alloc_skb(size, priority, 0); } +#ifdef CONFIG_NETCHANNEL +struct unetchannel; +extern struct sk_buff *netchannel_alloc(struct unetchannel *unc, unsigned int header_size, + unsigned int total_size, gfp_t gfp_mask); +#else +static struct sk_buff *netchannel_alloc(void *unc, unsigned int header_size, + unsigned int total_size, gfp_t gfp_mask) +{ + return NULL; +} +#endif + static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1912c6c..a42e608 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -605,4 +605,6 @@ asmlinkage long sys_getcpu(unsigned __us int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage long sys_netchannel_control(void __user *arg); + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0e53314..275e3e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -134,9 +134,12 @@ 
cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); + cond_syscall(compat_sys_move_pages); /* block-layer dependent */ cond_syscall(sys_bdflush); cond_syscall(sys_ioprio_set); cond_syscall(sys_ioprio_get); + +cond_syscall(sys_netchannel_control); diff --git a/net/Kconfig b/net/Kconfig index a81aca4..db801d1 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -66,6 +66,14 @@ source "net/ipv6/Kconfig" endif # if INET +config NETCHANNEL + bool "Network channels" + ---help--- + Network channels are peer-to-peer abstraction, which allows to create + high performance communications. + Main advantages are unified address cache, protocol processing moved + to userspace, receiving zero-copy support and other interesting features. + config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/core/Makefile b/net/core/Makefile index 1195680..442b83f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,5 +16,6 @@ obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_WIRELESS_EXT) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NETCHANNEL) += netchannel.o obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o diff --git a/net/core/dev.c b/net/core/dev.c index 81c426a..33ba1ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1808,6 +1808,10 @@ #endif } } + ret = netchannel_recv(skb); + if (!ret) + goto out; + #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); diff --git a/net/core/netchannel.c b/net/core/netchannel.c new file mode 100644 index 0000000..2c5fe34 --- /dev/null +++ b/net/core/netchannel.c @@ -0,0 +1,897 @@ +/* + * netchannel.c + * + * 2006 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]> + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/skbuff.h> +#include <linux/highmem.h> +#include <linux/workqueue.h> +#include <linux/rbtree.h> +#include <linux/netfilter.h> +#include <linux/netchannel.h> + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> + +#include <net/route.h> +#include <net/ip.h> + +#include <linux/netdevice.h> + +#include <asm/uaccess.h> + +static struct rb_root netchannel_root = RB_ROOT; +static kmem_cache_t *netchannel_cache; +static DEFINE_MUTEX(netchannel_tree_lock); + +static int netchannel_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + /* So original magic... 
*/ + return get_sb_pseudo(fs_type, "netchannel", NULL, 0xabcdef, mnt); +} + +static struct file_system_type netchannel_fs = { + .name = "netchannel", + .get_sb = netchannel_get_sb, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *netchannel_mnt; + +static inline int netchannel_compare(struct unetchannel *unc1, struct unetchannel *unc2) +{ + u32 ports1, ports2; + u64 addrs1, addrs2; + + ports1 = unc1->fport; + ports1 = (ports1 << 16) | unc1->lport; + ports2 = unc2->fport; + ports2 = (ports2 << 16) | unc2->lport; + + addrs1 = unc1->faddr; + addrs1 = (addrs1 << 16) | unc1->laddr; + addrs2 = unc2->faddr; + addrs2 = (addrs2 << 16) | unc2->laddr; + + if (unc1->proto > unc2->proto) + return 1; + if (unc1->proto < unc2->proto) + return -1; + + if (ports1 > ports2) + return 1; + if (ports1 < ports2) + return -1; + + if (addrs1 > addrs2) + return 1; + if (addrs1 < addrs2) + return -1; + + return 0; +} + +static struct netchannel *netchannel_search(struct unetchannel *unc) +{ + struct rb_node *node = netchannel_root.rb_node; + struct netchannel *nc, *ret = NULL; + int cmp; + + while (node) { + nc = rb_entry(node, struct netchannel, netchannel_node); + + cmp = netchannel_compare(&nc->unc, unc); + if (cmp > 0) + node = node->rb_right; + else if (cmp < 0) + node = node->rb_left; + else { + ret = nc; + break; + } + } + + return ret; +} + +static inline void netchannel_dump_info(struct netchannel *nc, char *prefix, int err) +{ + printk(KERN_NOTICE "netchannel: %s %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, " + "proto: %u, copy: %u, state: %u, order: %u [%u], hit: %lu, err: %d, qlen: %lu.\n", + prefix, NIPQUAD(nc->unc.laddr), ntohs(nc->unc.lport), NIPQUAD(nc->unc.faddr), ntohs(nc->unc.fport), + nc->unc.proto, nc->unc.copy, nc->unc.state, nc->unc.memory_limit_order, + (1<<nc->unc.memory_limit_order), nc->hit, err, nc->qlen); +} + +static void netchannel_free_rcu(struct rcu_head *rcu) +{ + struct netchannel *nc = container_of(rcu, struct netchannel, rcu_head); + + 
skb_queue_purge(&nc->recv_queue); + dst_release(nc->dst); + + netchannel_dump_info(nc, "cleanup", 0); + kmem_cache_free(netchannel_cache, nc); +} + +static inline void netchannel_get(struct netchannel *nc) +{ + atomic_inc(&nc->refcnt); +} + +static inline void netchannel_put(struct netchannel *nc) +{ + if (atomic_dec_and_test(&nc->refcnt)) { + netchannel_dump_info(nc, "put", 0); + call_rcu(&nc->rcu_head, &netchannel_free_rcu); + } +} + +static int netchannel_ip_route_output_flow(struct rtable **rp, struct flowi *flp, int flags) +{ + int err; + + err = __ip_route_output_key(rp, flp); + if (err) + return err; + + if (flp->proto) { + if (!flp->fl4_src) + flp->fl4_src = (*rp)->rt_src; + if (!flp->fl4_dst) + flp->fl4_dst = (*rp)->rt_dst; + } + + return 0; +} + +static struct dst_entry *netchannel_route_get_raw(struct netchannel *nc) +{ + struct rtable *rt; + struct flowi fl = { .oif = 0, + .nl_u = { .ip4_u = + { .daddr = nc->unc.faddr, + .saddr = nc->unc.laddr, + .tos = 0 } }, + .proto = nc->unc.proto, + .uli_u = { .ports = + { .sport = nc->unc.lport, + .dport = nc->unc.fport } } }; + + if (netchannel_ip_route_output_flow(&rt, &fl, 0)) + goto no_route; + return dst_clone(&rt->u.dst); + +no_route: + return NULL; +} + +static struct dst_entry *netchannel_route_get(struct netchannel *nc) +{ + if (nc->dst && nc->dst->obsolete && nc->dst->ops->check(nc->dst, 0) == NULL) { + dst_release(nc->dst); + nc->dst = netchannel_route_get_raw(nc); + if (!nc->dst) + return NULL; + } + return dst_clone(nc->dst); +} + +static int netchannel_convert_skb_ipv6(struct sk_buff *skb, struct unetchannel *unc) +{ + /* + * Hash IP addresses into src/dst. Setup TCP/UDP ports. + * Not supported yet. 
+ */ + return -1; +} + +static int netchannel_convert_skb_ipv4(struct sk_buff *skb, struct unetchannel *unc) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = skb->nh.iph; + + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = skb->nh.iph; + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl*4)) + goto inhdr_error; + + if (pskb_trim_rcsum(skb, len)) + goto inhdr_error; + + unc->faddr = iph->saddr; + unc->laddr = iph->daddr; + unc->proto = iph->protocol; + + len = skb->len; + + skb->h.raw = skb->nh.raw + iph->ihl*4; + + switch (unc->proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + unc->fport = ((u16 *)skb->h.raw)[0]; + unc->lport = ((u16 *)skb->h.raw)[1]; + break; + default: + goto inhdr_error; + } + + return 0; + +inhdr_error: + return -1; +} + +static int netchannel_convert_skb(struct sk_buff *skb, struct unetchannel *unc) +{ + if (skb->pkt_type == PACKET_OTHERHOST) + return -1; + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + return netchannel_convert_skb_ipv4(skb, unc); + case ETH_P_IPV6: + return netchannel_convert_skb_ipv6(skb, unc); + default: + return -1; + } +} + +/* + * By design netchannels allow to "allocate" data + * not only from SLAB cache, but get it from mapped area + * or from VFS cache (requires process' context or preallocation). 
+ */ +struct sk_buff *netchannel_alloc(struct unetchannel *unc, unsigned int header_size, + unsigned int total_size, gfp_t gfp_mask) +{ + struct netchannel *nc; + int err; + struct sk_buff *skb = NULL; + unsigned int size, pnum, i; + + skb = alloc_skb(header_size, gfp_mask); + if (!skb) + return NULL; + + rcu_read_lock(); + nc = netchannel_search(unc); + if (!nc) { + err = -ENODEV; + goto err_out_free_skb; + } + + if (!nc->nc_alloc_page || !nc->nc_free_page) { + err = -EINVAL; + goto err_out_free_skb; + } + + size = total_size - header_size; + pnum = PAGE_ALIGN(size) >> PAGE_SHIFT; + + for (i=0; i<pnum; ++i) { + unsigned int cs = min_t(unsigned int, PAGE_SIZE, size); + struct page *page; + + page = nc->nc_alloc_page(cs); + if (!page) + break; + + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, page, 0, cs); + + skb->len += cs; + skb->data_len += cs; + skb->truesize += cs; + + size -= cs; + } + + if (i < pnum) { + pnum = i; + err = -ENOMEM; + goto err_out_free_frags; + } + + rcu_read_unlock(); + + return skb; + +err_out_free_frags: + for (i=0; i<pnum; ++i) { + unsigned int cs = skb_shinfo(skb)->frags[i].size; + struct page *page = skb_shinfo(skb)->frags[i].page; + + nc->nc_free_page(page); + + skb->len -= cs; + skb->data_len -= cs; + skb->truesize -= cs; + } + +err_out_free_skb: + rcu_read_unlock(); + kfree_skb(skb); + return NULL; +} + +int netchannel_recv(struct sk_buff *skb) +{ + struct netchannel *nc; + struct unetchannel unc; + int err; + + rcu_read_lock(); + + err = netchannel_convert_skb(skb, &unc); + if (err) + goto unlock; + + nc = netchannel_search(&unc); + if (!nc) { + err = -ENODEV; + goto unlock; + } + + nc->hit++; +#if 1 + if (nc->qlen + skb->len > (1 << nc->unc.memory_limit_order)) { + kfree_skb(skb); + err = 0; + goto unlock; + } +#endif + nc->qlen += skb->len; + skb_queue_tail(&nc->recv_queue, skb); + wake_up(&nc->wait); + +unlock: + rcu_read_unlock(); + + return err; +} + +static int netchannel_wait_for_packet(struct netchannel *nc, long 
*timeo_p) +{ + int error = 0; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(&nc->wait, &wait, TASK_INTERRUPTIBLE); + + if (skb_queue_empty(&nc->recv_queue)) { + if (signal_pending(current)) + goto interrupted; + + *timeo_p = schedule_timeout(*timeo_p); + } +out: + finish_wait(&nc->wait, &wait); + return error; +interrupted: + error = (*timeo_p == MAX_SCHEDULE_TIMEOUT) ? -ERESTARTSYS : -EINTR; + goto out; +} + +struct sk_buff *netchannel_get_skb(struct netchannel *nc, unsigned int *timeout, int *error) +{ + struct sk_buff *skb = NULL; + long tm = *timeout; + + *error = 0; + + while (1) { + skb = skb_dequeue(&nc->recv_queue); + if (skb) + break; + + if (*timeout) { + *error = netchannel_wait_for_packet(nc, &tm); + if (*error) { + *timeout = tm; + break; + } + tm = *timeout; + } else { + *error = -EAGAIN; + break; + } + } + + if (!skb) + skb = skb_dequeue(&nc->recv_queue); + + if (skb) + nc->qlen -= skb->len; + + return skb; +} + +static int netchannel_copy_from_user(struct netchannel *nc, unsigned int *timeout, __u16 len, __u16 header_len, void __user *arg) +{ + struct sk_buff *skb; + int err = -EINVAL; + struct dst_entry *dst; + struct net_device *dev; + + if (header_len > len) + goto err_out_exit; + + dst = netchannel_route_get(nc); + if (!dst) { + err = -EHOSTUNREACH; + goto err_out_exit; + } + + dev = dst->dev; + + skb = alloc_skb(len+LL_RESERVED_SPACE(dev), GFP_KERNEL); + if (!skb) { + err = -ENOMEM; + goto err_out_route_put; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + err = skb_add_data(skb, arg, len); + if (err) + goto err_out_free; + + skb->ip_summed = CHECKSUM_NONE; + + skb->nh.raw = skb->data; + skb->h.raw = skb->data + header_len; + skb->protocol = htons(ETH_P_IP); + skb->dst = dst; + skb->dev = dst->dev; + +#if defined(NETCHANNEL_DEBUG) + if (nc->unc.proto == IPPROTO_TCP) { + struct tcphdr *th = skb->h.th; + + printk("S %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u : seq: %u, ack: %u, win: %u, doff: %u, " + 
"s: %u, a: %u, p: %u, r: %u, f: %u, len: %u, skb: %p, csum: %04x.\n", + NIPQUAD(nc->unc.laddr), ntohs(nc->unc.lport), + NIPQUAD(nc->unc.faddr), ntohs(nc->unc.fport), + ntohl(th->seq), ntohl(th->ack_seq), ntohs(th->window), th->doff, + th->syn, th->ack, th->psh, th->rst, th->fin, + skb->len, skb, th->check); + } +#endif + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + +err_out_free: + kfree_skb(skb); + dst = NULL; +err_out_route_put: + dst_release(dst); +err_out_exit: + return err; +} + +static int netchannel_copy_to_user(struct netchannel *nc, unsigned int *timeout, __u16 *len, void __user *arg) +{ + unsigned int copied; + struct sk_buff *skb; + struct iovec to; + int err; + + skb = netchannel_get_skb(nc, timeout, &err); + if (!skb) + return err; + + to.iov_base = arg; + to.iov_len = *len; + + copied = skb->len; + if (copied > *len) + copied = *len; + + err = skb_copy_datagram_iovec(skb, 0, &to, copied); + + *len = (err == 0)?copied:0; + + kfree_skb(skb); + + return err; +} + +static int netchannel_copy_user_setup(struct netchannel *nc) +{ + nc->nc_recv_data = &netchannel_copy_to_user; + nc->nc_send_data = &netchannel_copy_from_user; + + return 0; +} + +static int netchannel_setup(struct netchannel *nc) +{ + int ret = 0; + + if (nc->unc.memory_limit_order > NETCHANNEL_MAX_ORDER) + nc->unc.memory_limit_order = NETCHANNEL_MAX_ORDER; + + if (nc->unc.memory_limit_order < NETCHANNEL_MIN_ORDER) + nc->unc.memory_limit_order = NETCHANNEL_MIN_ORDER; + + switch (nc->unc.copy) { + case NETCHANNEL_COPY_USER: + ret = netchannel_copy_user_setup(nc); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void netchannel_work(void *data) +{ + struct netchannel *nc = data; + + netchannel_dump_info(nc, "work", 0); + schedule_delayed_work(&nc->work, msecs_to_jiffies(1000*nc->unc.init_stat_work)); +} + +static void netchannel_tree_remove(struct netchannel *nc) +{ + rb_erase(&nc->netchannel_node, &netchannel_root); +} + 
+static int netchannel_tree_add(struct netchannel *new) +{ + struct rb_node **p = &netchannel_root.rb_node, *parent = NULL; + struct netchannel *nc; + int err = 0, cmp = 0; + + while (*p) { + parent = *p; + nc = rb_entry(parent, struct netchannel, netchannel_node); + + cmp = netchannel_compare(&nc->unc, &new->unc); + if (cmp > 0) + p = &parent->rb_right; + else if (cmp < 0) + p = &parent->rb_left; + else { + err = -EEXIST; + break; + } + } + if (likely(!err)) { + rb_link_node(&new->netchannel_node, parent, p); + rb_insert_color(&new->netchannel_node, &netchannel_root); + } + + return err; +} + +ssize_t netchannel_read(struct file *file, char __user *buf, size_t size, loff_t *off) +{ + struct netchannel *nc = file->private_data; + unsigned int timeout = 0; + int ret; + + ret = nc->nc_recv_data(nc, &timeout, (__u16 *)&size, buf); + if (ret < 0) + return ret; + return size; +} + +ssize_t netchannel_write(struct file *file, const char __user *buf, size_t size, loff_t *off) +{ + return -ENOTSUPP; +} + +unsigned int netchannel_poll(struct file *file, struct poll_table_struct *wait) +{ + struct netchannel *nc = file->private_data; + unsigned int mask = 0; + + poll_wait(file, &nc->wait, wait); + if (!skb_queue_empty(&nc->recv_queue)) + mask |= POLLIN; + + return mask; +} + +static int netchannel_release(struct inode *inode, struct file *file) +{ + struct netchannel *nc = file->private_data; + + mutex_lock(&netchannel_tree_lock); + netchannel_tree_remove(nc); + mutex_unlock(&netchannel_tree_lock); + + if (nc->unc.init_stat_work) { + cancel_rearming_delayed_work(&nc->work); + flush_scheduled_work(); + } + + netchannel_dump_info(nc, "remove", 0); + netchannel_put(nc); + + return 0; +} + +static struct file_operations netchannel_fops = { + .release = netchannel_release, + .read = netchannel_read, + .poll = netchannel_poll, + .write = netchannel_write, + .owner = THIS_MODULE, +}; + +static struct netchannel *netchannel_search_control(struct unetchannel_control *ctl) +{ + struct 
netchannel *nc; + + if (ctl->fd) { + struct file *file; + int fput_needed; + + file = fget_light(ctl->fd, &fput_needed); + if (!file) + return NULL; + + nc = file->private_data; + + fput_light(file, fput_needed); + + if (!nc) + return NULL; + } else { + mutex_lock(&netchannel_tree_lock); + nc = netchannel_search(&ctl->unc); + if (!nc) + goto err_out_unlock; + + netchannel_get(nc); + mutex_unlock(&netchannel_tree_lock); + } + + return nc; + +err_out_unlock: + mutex_unlock(&netchannel_tree_lock); + return NULL; +} + +static int netchannel_send_data(struct unetchannel_control *ctl, void __user *data) +{ + int ret; + struct netchannel *nc; + + nc = netchannel_search_control(ctl); + if (!nc) + return -ENODEV; + + ret = nc->nc_send_data(nc, &ctl->timeout, ctl->len, ctl->header_len, data); + + if (!ctl->fd) + netchannel_put(nc); + return ret; +} + +static int netchannel_recv_data(struct unetchannel_control *ctl, void __user *data) +{ + int ret; + struct netchannel *nc; + + nc = netchannel_search_control(ctl); + if (!nc) + return -ENODEV; + + ret = nc->nc_recv_data(nc, &ctl->timeout, &ctl->len, data); + + if (!ctl->fd) + netchannel_put(nc); + return ret; +} + +static int netchannel_bind_fd(struct netchannel *nc) +{ + struct file *file; + int fd, ret; + + fd = get_unused_fd(); + if (fd < 0) + return fd; + + file = get_empty_filp(); + if (!file) { + ret = -ENFILE; + goto out_put_fd; + } + + netchannel_get(nc); + + file->f_op = &netchannel_fops; + file->f_vfsmnt = mntget(netchannel_mnt); + file->f_dentry = dget(netchannel_mnt->mnt_root); + file->f_mapping = file->f_dentry->d_inode->i_mapping; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->private_data = nc; + + fd_install(fd, file); + + return fd; + +out_put_fd: + put_unused_fd(fd); + return ret; +} + +static int netchannel_create(struct unetchannel *unc) +{ + struct netchannel *nc; + int err = -ENOMEM, fd; + + nc = kmem_cache_alloc(netchannel_cache, GFP_KERNEL); + if (!nc) + return -ENOMEM; + + memset(nc, 
0, sizeof(struct netchannel)); + + nc->hit = 0; + skb_queue_head_init(&nc->recv_queue); + init_waitqueue_head(&nc->wait); + atomic_set(&nc->refcnt, 0); + memcpy(&nc->unc, unc, sizeof(struct unetchannel)); + + err = netchannel_setup(nc); + if (err) + goto err_out_free; + + nc->dst = netchannel_route_get_raw(nc); + if (!nc->dst) { + err = -ENODEV; + goto err_out_free; + } + + mutex_lock(&netchannel_tree_lock); + err = netchannel_tree_add(nc); + if (err) + goto err_out_unlock; + + fd = netchannel_bind_fd(nc); + if (fd < 0) { + err = fd; + goto err_out_unlock; + } + + mutex_unlock(&netchannel_tree_lock); + + netchannel_dump_info(nc, "create", err); + + if (nc->unc.init_stat_work) { + INIT_WORK(&nc->work, netchannel_work, nc); + schedule_delayed_work(&nc->work, msecs_to_jiffies(1000*nc->unc.init_stat_work)); + } + + return fd; + +err_out_unlock: + mutex_unlock(&netchannel_tree_lock); + dst_release(nc->dst); +err_out_free: + kmem_cache_free(netchannel_cache, nc); + + return err; +} + +asmlinkage long sys_netchannel_control(void __user *arg) +{ + struct unetchannel_control ctl; + int ret; + + if (copy_from_user(&ctl, arg, sizeof(struct unetchannel_control))) + return -EFAULT; + + switch (ctl.cmd) { + case NETCHANNEL_CREATE: + ret = netchannel_create(&ctl.unc); + break; + case NETCHANNEL_RECV: + ret = netchannel_recv_data(&ctl, arg + sizeof(struct unetchannel_control)); + break; + case NETCHANNEL_SEND: + ret = netchannel_send_data(&ctl, arg + sizeof(struct unetchannel_control)); + break; + default: + ret = -EINVAL; + break; + } + if (copy_to_user(arg, &ctl, sizeof(struct unetchannel_control))) + return -EFAULT; + + return ret; +} + + + +static int __init netchannel_init(void) +{ + int err; + + err = register_filesystem(&netchannel_fs); + if (err) { + printk(KERN_ERR "Failed to register netchannel fs, err: %d.\n", err); + return err; + } + + netchannel_mnt = kern_mount(&netchannel_fs); + if (IS_ERR(netchannel_mnt)) { + printk(KERN_ERR "Failed to mount netchannel fs, err: 
%ld.\n", PTR_ERR(netchannel_mnt)); + err = PTR_ERR(netchannel_mnt); + goto err_out_unregister; + } + + netchannel_cache = kmem_cache_create("netchannel", sizeof(struct netchannel), 0, 0, + NULL, NULL); + if (!netchannel_cache) + goto err_out_umount; + + return 0; + +err_out_umount: + mntput(netchannel_mnt); +err_out_unregister: + unregister_filesystem(&netchannel_fs); + printk(KERN_NOTICE "netchannel: failed to initialize tree.\n"); + return err; +} + +static void __exit netchannel_exit(void) +{ + kmem_cache_destroy(netchannel_cache); + mntput(netchannel_mnt); + unregister_filesystem(&netchannel_fs); +} + +module_init(netchannel_init); +module_exit(netchannel_exit); -- Evgeniy Polyakov
atcp_speed.png
Description: PNG image