Hey guys... I've been working with Rusty on a VJ Channel implementation. Noting Dave's recent release of his implementation, we thought we'd better get this "out there" so we can do some early comparison/combining and come up with the best possible implementation.
There are three patches in total:
1) vj_core.patch - core files for VJ to userspace
2) vj_udp.patch - badly hacked up UDP receive implementation - basically just to test what the logic may be like!
3) vj_ne2k.patch - modified NE2K and 8390 used for testing on QEMU

Notes:
* Channels can have global or local buffers (local buffers are for userspace, and could be used directly by an intelligent NIC).
* UDP receive breaks real UDP - it doesn't talk anything except VJ Channels anymore. Needs integration with normal sources.
* The userspace test app (below) uses the VJ protocol family to mmap space for local buffers; if it receives buffers in kernel space, it sends a request for that buffer to be copied to a local buffer.
* The default channel converts to skb and feeds through the normal receive path.

TODO:
* send not yet implemented
* integrate non vj
* LOTS of fixmes

Cheers,
Kelly

Test userspace app:

/* Van Jacobson net channels implementation for Linux
   Copyright (C) 2006 Kelly Daly <[EMAIL PROTECTED]> IBM Corporation

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <unistd.h> #include <sys/types.h> #include <sys/socket.h> #include <sys/mman.h> #include <sys/poll.h> #include <netinet/in.h> #include "linux-2.6.16/include/linux/types.h" #include "linux-2.6.16/include/linux/vjchan.h" //flowid #define SADDR 0 #define DADDR 0 #define SPORT 0 #define DPORT 60000 #define IFINDEX 0 #define PF_VJCHAN 27 static struct vj_buffer *get_buffer(struct vj_channel_ring *ring, int desc_num) { printf("desc_num %i\n", desc_num); return (void *)ring + (desc_num + 1) * getpagesize(); } /* return the next buffer, but do not move on */ static struct vj_buffer *vj_peek_next_buffer(struct vj_channel_ring *ring) { if (ring->c.head == ring->p.tail) return NULL; return get_buffer(ring, ring->q[ring->c.head]); } /* move on to next buffer */ static void vj_done_with_buffer(struct vj_channel_ring *ring) { ring->c.head = (ring->c.head+1)%VJ_NET_CHANNEL_ENTRIES; printf("done_with_buffer\n\n"); } int main(int argc, char *argv[]) { int sk, cls, bnd, pll; void * mmapped; struct vj_flowid flowid; struct vj_channel_ring *ring; struct vj_buffer *buf; struct pollfd pfd; printf("\nstart of vjchannel socket test app\n"); sk = socket(PF_VJCHAN, SOCK_DGRAM, IPPROTO_UDP); if (sk == -1) { perror("Unable to open socket!"); return -1; } printf("socket open with ret code %i\n\n", sk); //create flowid!!! 
flowid.saddr = SADDR; flowid.daddr = DADDR; flowid.sport = SPORT; flowid.dport = htons(DPORT); flowid.ifindex = IFINDEX; flowid.proto = IPPROTO_UDP; printf("flowid created\n"); bnd = bind(sk, (struct sockaddr *)&flowid, sizeof(struct vj_flowid)); if (bnd == -1) { perror("Unable to bind socket!"); return -1; } printf("socket bound with ret code %i\n\n", bnd); ring = mmap(0, (getpagesize() * (VJ_NET_CHANNEL_ENTRIES+1)), PROT_READ|PROT_WRITE, MAP_SHARED, sk, 0); if (ring == MAP_FAILED) { perror ("Unable to mmap socket!"); return -1; } printf("socket mmapped to address %lu\n\n", (unsigned long)mmapped); pfd.fd = sk; pfd.events = POLLIN; for (;;) { pll = poll(&pfd, 1, -1); if (pll < 0) { perror("polling failed!"); return -1; } //consume buf = vj_peek_next_buffer(ring); printf("buf %p\n", buf); //print data, not headers printf(" Buffer Length = %i\n", buf->data_len); printf(" Header Length = %i\n", buf->header_len); printf(" Buffer Data: '%.*s'\n", buf->data_len - 28, buf->data + buf->header_len + 28); vj_done_with_buffer(ring); } cls = close(sk); if (cls != 0) { perror("Unable to close socket!"); return -2; } printf("socket closed with ret code %i\n\n", cls); return 0; } ------------------------- Signed-off-by: Kelly Daly <[EMAIL PROTECTED]> Basic infrastructure for Van Jacobson net channels: lockless ringbuffer for buffer transport. Entries in ring buffer are descriptors for global or local buffers: ring and local buffers are mmapped into userspace. Channels are registered with the core by flowid, and a thread services the default channel for any non-matching packets. Drivers get (global) buffers from vj_get_buffer, and dispatch them through vj_netif_rx. As userspace mmap cannot reach global buffers, select() copies global buffers into local buffers if required. 
diff -r 47031a1f466c linux-2.6.16/include/linux/socket.h --- linux-2.6.16/include/linux/socket.h Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/include/linux/socket.h Mon Apr 24 19:50:46 2006 @@ -186,6 +187,7 @@ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ +#define AF_VJCHAN 27 /* VJ Channel */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_MAX 32 /* For now.. */ @@ -219,7 +221,8 @@ #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC +#define PF_VJCHAN AF_VJCHAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_MAX AF_MAX diff -r 47031a1f466c linux-2.6.16/net/Kconfig --- linux-2.6.16/net/Kconfig Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/net/Kconfig Mon Apr 24 19:50:46 2006 @@ -65,6 +65,12 @@ source "net/ipv6/Kconfig" endif # if INET + +config VJCHAN + bool "Van Jacobson Net Channel Support (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + This adds a userspace-accessible packet receive interface. Say N. menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" diff -r 47031a1f466c linux-2.6.16/net/Makefile --- linux-2.6.16/net/Makefile Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/net/Makefile Mon Apr 24 19:50:46 2006 @@ -46,6 +46,7 @@ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_IEEE80211) += ieee80211/ obj-$(CONFIG_TIPC) += tipc/ +obj-$(CONFIG_VJCHAN) += vjchan/ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o diff -r 47031a1f466c linux-2.6.16/include/linux/vjchan.h --- /dev/null Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/include/linux/vjchan.h Mon Apr 24 19:50:46 2006 @@ -0,0 +1,79 @@ +#ifndef _LINUX_VJCHAN_H +#define _LINUX_VJCHAN_H + +/* num entries in channel q: set so consumer is at offset 1024. */ +#define VJ_NET_CHANNEL_ENTRIES 254 +/* identifies non-local buffers (ie. 
need kernel to copy to a local) */ +#define VJ_HIGH_BIT 0x80000000 + +struct vj_producer { + __u16 tail; /* next element to add */ + __u8 wakecnt; /* do wakeup if != consumer wakecnt */ + __u8 pad; + __u16 old_head; /* last cleared buffer posn +1 */ + __u16 pad2; +}; + +struct vj_consumer { + __u16 head; /* next element to remove */ + __u8 wakecnt; /* increment to request wakeup */ +}; + +/* mmap returns one of these, followed by 254 pages with a buffer each */ +struct vj_channel_ring { + struct vj_producer p; /* producer's header */ + __u32 q[VJ_NET_CHANNEL_ENTRIES]; + struct vj_consumer c; /* consumer's header */ +}; + +struct vj_buffer { + __u32 data_len; /* length of actual data in buffer */ + __u32 header_len; /* offset eth + ip header (true for now) */ + __u32 ifindex; /* interface the packet came in on. */ + char data[0]; +}; + +/* Currently assumed IPv4 */ +struct vj_flowid +{ + __u32 saddr, daddr; + __u16 sport, dport; + __u32 ifindex; + __u16 proto; +}; + +#ifdef __KERNEL__ +struct net_device; +struct sk_buff; + +struct vj_descriptor { + unsigned long address; /* address of net_channel_buffer */ + unsigned long buffer_len; /* max length including header */ +}; + +/* Everything about a vj_channel */ +struct vj_channel +{ + struct vj_channel_ring *ring; + wait_queue_head_t wq; + struct list_head list; + struct vj_flowid flowid; + int num_local_buffers; + struct vj_descriptor *descs; + unsigned long * used_descs; +}; + +void vj_inc_wakecnt(struct vj_channel *chan); +struct vj_buffer *vj_get_buffer(int *desc_num); +void vj_netif_rx(struct vj_buffer *buffer, int desc_num, unsigned short proto); +int vj_xmit(struct sk_buff *skb, struct net_device *dev); +struct vj_channel *vj_alloc_chan(int num_buffers); +void vj_register_chan(struct vj_channel *chan, const struct vj_flowid *flowid); +void vj_unregister_chan(struct vj_channel *chan); +void vj_free_chan(struct vj_channel *chan); +struct vj_buffer *vj_peek_next_buffer(struct vj_channel *chan); +void 
vj_done_with_buffer(struct vj_channel *chan); +unsigned short eth_vj_type_trans(struct vj_buffer *buffer); +int vj_need_local_buffer(struct vj_channel *chan); +#endif +#endif /* _LINUX_VJCHAN_H */ diff -r 47031a1f466c linux-2.6.16/net/vjchan/Makefile --- /dev/null Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/net/vjchan/Makefile Mon Apr 24 19:50:46 2006 @@ -0,0 +1,3 @@ +#obj-m += vjtest.o +obj-y += vjnet.o +obj-y += af_vjchan.o diff -r 47031a1f466c linux-2.6.16/net/vjchan/af_vjchan.c --- /dev/null Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/net/vjchan/af_vjchan.c Mon Apr 24 19:50:46 2006 @@ -0,0 +1,198 @@ +/* Van Jacobson net channels implementation for Linux + Copyright (C) 2006 Kelly Daly <[EMAIL PROTECTED]> IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/socket.h> +#include <linux/vjchan.h> +#include <net/sock.h> + +struct vjchan_sock +{ + struct sock sk; + struct vj_channel *chan; + int vj_reg_flag; +}; + +static inline struct vjchan_sock *vj_sk(struct sock *sk) +{ + return (struct vjchan_sock *)sk; +} + +static struct proto vjchan_proto = { + .name = "VJCHAN", + .owner = THIS_MODULE, + .obj_size = sizeof(struct vjchan_sock), +}; + +int vjchan_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + sock_orphan(sk); + sock->sk = NULL; + sock_put(sk); + return 0; +} + +int vjchan_bind(struct socket *sock, struct sockaddr *addr, int sockaddr_len) +{ + struct sock *sk = sock->sk; + struct vjchan_sock *vjsk; + struct vj_flowid *flowid = (struct vj_flowid *)addr; + + /* FIXME: avoid clashing with normal sockets, replace zeroes. */ + vjsk = vj_sk(sk); + vj_register_chan(vjsk->chan, flowid); + vjsk->vj_reg_flag = 1; + + return 0; +} + +int vjchan_getname(struct socket *sock, struct sockaddr *addr, + int *sockaddr_len, int peer) +{ + /* FIXME: Implement */ + return 0; +} + +unsigned int vjchan_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + struct sock *sk = sock->sk; + struct vj_channel *chan = vj_sk(sk)->chan; + + poll_wait(file, &chan->wq, wait); + vj_inc_wakecnt(chan); + + if (vj_peek_next_buffer(chan) && vj_need_local_buffer(chan) == 0) + return POLLIN | POLLRDNORM; + + return 0; +} + +/* We map the ring first, then one page per buffer. 
*/ +int vjchan_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct vj_channel *chan = vj_sk(sk)->chan; + int i, vip; + unsigned long pos; + + if (vma->vm_end - vma->vm_start != + (1 + chan->num_local_buffers)*PAGE_SIZE) + return -EINVAL; + + pos = vma->vm_start; + vip = vm_insert_page(vma, pos, virt_to_page(chan->ring)); + pos += PAGE_SIZE; + for (i = 0; i < chan->num_local_buffers; i++) { + vip = vm_insert_page(vma, pos, virt_to_page(chan->descs[i].address)); + pos += PAGE_SIZE; + } + return 0; +} + +const struct proto_ops vjchan_ops = { + .family = PF_VJCHAN, + .owner = THIS_MODULE, + .release = vjchan_release, + .bind = vjchan_bind, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = vjchan_getname, + .poll = vjchan_poll, + .ioctl = sock_no_ioctl, + .shutdown = sock_no_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = vjchan_mmap, + .sendpage = sock_no_sendpage +}; + +static void vjchan_destruct(struct sock *sk) +{ + struct vjchan_sock *vjsk; + + vjsk = vj_sk(sk); + if (vjsk->vj_reg_flag) { + vj_unregister_chan(vjsk->chan); + vjsk->vj_reg_flag = 0; + } + vj_free_chan(vjsk->chan); + +} + +static int vjchan_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct vjchan_sock *vjsk; + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_DGRAM + && sock->type != SOCK_RAW + && sock->type != SOCK_PACKET) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + err = -ENOBUFS; + sk = sk_alloc(PF_VJCHAN, GFP_KERNEL, &vjchan_proto, 1); + if (sk == NULL) + goto out; + + sock->ops = &vjchan_ops; + + sock_init_data(sock, sk); + sk->sk_family = PF_VJCHAN; + sk->sk_destruct = vjchan_destruct; + + vjsk = vj_sk(sk); + vjsk->chan = vj_alloc_chan(VJ_NET_CHANNEL_ENTRIES); + vjsk->vj_reg_flag = 0; + if (!vjsk->chan) + return -ENOMEM; 
+ return 0; +out: + return err; +} + +static struct net_proto_family vjchan_family_ops = { + .family = PF_VJCHAN, + .create = vjchan_create, + .owner = THIS_MODULE, +}; + +static void __exit vjchan_exit(void) +{ + sock_unregister(PF_VJCHAN); +} + +static int __init vjchan_init(void) +{ + return sock_register(&vjchan_family_ops); +} + +module_init(vjchan_init); +module_exit(vjchan_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_VJCHAN); diff -r 47031a1f466c linux-2.6.16/net/vjchan/vjnet.c --- /dev/null Thu Mar 23 06:32:12 2006 +++ linux-2.6.16/net/vjchan/vjnet.c Mon Apr 24 19:50:46 2006 @@ -0,0 +1,550 @@ +/* Van Jacobson net channels implementation for Linux + Copyright (C) 2006 Kelly Daly <[EMAIL PROTECTED]> IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/etherdevice.h> +#include <linux/spinlock.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/vjchan.h> + +#define BUFFER_DATA_LEN 2048 +#define NUM_GLOBAL_DESCRIPTORS 1024 + +/* All our channels. FIXME: Lockless funky hash structure please... 
*/ +static LIST_HEAD(channels); +static spinlock_t chan_lock = SPIN_LOCK_UNLOCKED; + +/* Default channel, also holds global buffers (userspace-mapped + * channels have local buffers, which they prefer to use). */ +static struct vj_channel *default_chan; + +/* need to increment for wake in udp.c wait_for_vj_buffer */ +void vj_inc_wakecnt(struct vj_channel *chan) +{ + chan->ring->c.wakecnt++; + pr_debug("*** incremented wakecnt - should allow wake up\n"); +} +EXPORT_SYMBOL(vj_inc_wakecnt); + +static int is_empty(struct vj_channel_ring *ring) +{ + if (ring->c.head == ring->p.tail) + return 1; + return 0; +} + +static struct vj_buffer *get_buffer(unsigned int desc_num, + struct vj_channel *chan) +{ + struct vj_buffer *buf; + + if ((desc_num & VJ_HIGH_BIT) || (chan->num_local_buffers == 0)) { + desc_num &= ~VJ_HIGH_BIT; + BUG_ON(desc_num >= default_chan->num_local_buffers); + buf = (struct vj_buffer*)default_chan->descs[desc_num].address; + } else { + BUG_ON(desc_num >= chan->num_local_buffers); + buf = (struct vj_buffer *)chan->descs[desc_num].address; + } + + pr_debug(" received desc_num is %i\n", desc_num); + pr_debug("get_buffer %p (%s) %i: %p (len=%li ifind=%i hlen=%li) %#02X %#02X %#02X %#02X %#02X %#02X %#02X %#02X\n", + current, current->comm, desc_num, buf, buf->data_len, buf->ifindex, buf->header_len + (sizeof(struct iphdr *) * 4), + buf->data[0], buf->data[1], buf->data[2], buf->data[3], buf->data[4], buf->data[5], buf->data[6], buf->data[7]); + + return buf; +} + +static void release_buffer(struct vj_channel *chan, unsigned int descnum) +{ + if (descnum & VJ_HIGH_BIT) { + BUG_ON(test_bit(descnum & ~VJ_HIGH_BIT, + default_chan->used_descs) == 0); + clear_bit(descnum & ~VJ_HIGH_BIT, default_chan->used_descs); + } else { + BUG_ON(test_bit(descnum, chan->used_descs) == 0); + clear_bit(descnum, chan->used_descs); + } +} + +/* Free all descriptors for the current channel between where we last + * freed to and where the consumer has not yet consumed. 
chan->c.head + * is not cleared because it may not have been consumed, therefore + * chan->p.old_head is not cleared. If chan->p.old_head == + * chan->c.head then nothing more has been consumed since we last + * freed the descriptors. + * + * Because we're using local and global channels we need to select the + * bitmap according to the channel. Local channels may be pointing to + * local or global buffers, so we need to select the bitmap according + * to the buffer type */ + +/* Free descriptors consumer has consumed since last free */ +static void free_descs_for_channel(struct vj_channel *chan) +{ + struct vj_channel_ring *ring = chan->ring; + int desc_num; + + while (ring->p.old_head != ring->c.head) { + printk("ring->p.old_head %i, ring->c.head %i\n", ring->p.old_head, ring->c.head); + desc_num = ring->q[ring->p.old_head]; + + printk("desc_num %i\n", desc_num); + + /* FIXME: Security concerns: make sure this descriptor + * really used by this vjchannel. Userspace could + * have changed it. */ + release_buffer(chan, desc_num); + ring->p.old_head = (ring->p.old_head + 1) % VJ_NET_CHANNEL_ENTRIES; + printk("ring->p.old_head %i, ring->c.head %i\n\n", ring->p.old_head, ring->c.head); + } +} + +/* return -1 if no descriptor found and none can be freed */ +static int get_free_descriptor(struct vj_channel *chan) +{ + int free_desc, bitval; + + BUG_ON(chan->num_local_buffers == 0); + do { + free_desc = find_first_zero_bit(chan->used_descs, + chan->num_local_buffers); + pr_debug("free_desc = %i\n", free_desc); + if (free_desc >= chan->num_local_buffers) { + /* no descriptors, refresh bitmap and try again! */ + free_descs_for_channel(chan); + free_desc = find_first_zero_bit(chan->used_descs, + chan->num_local_buffers); + if (free_desc >= chan->num_local_buffers) + /* still no descriptors */ + return -1; + } + bitval = test_and_set_bit(free_desc, chan->used_descs); + pr_debug("bitval = %i\n", bitval); + } while (bitval == 1); //keep going until we get a FREE free bit! 
+ + /* We set high bit to indicate a global channel. */ + if (chan == default_chan) + free_desc |= VJ_HIGH_BIT; + return free_desc; +} + +/* This function puts a buffer into a local address space for a + * channel that is unable to use a kernel address space. If address + * high bit is set then the buffer is in kernel space - get a free + * local buffer and copy it across. Set local buf to used (done when + * finding free buffer), kernel buf to unused. */ +/* FIXME: Loop, do as many as possible at once. */ +int vj_need_local_buffer(struct vj_channel *chan) +{ + struct vj_channel_ring *ring = chan->ring; + u32 new_desc, k_desc; + + k_desc = ring->q[ring->c.head]; + + if (ring->q[ring->c.head] & VJ_HIGH_BIT) { + struct vj_buffer *buf, *kbuf; + + kbuf = get_buffer(k_desc, chan); + new_desc = get_free_descriptor(chan); + if (new_desc == -1) + return -ENOBUFS; + buf = get_buffer(new_desc, chan); + memcpy (buf, kbuf, sizeof(struct vj_buffer) + + kbuf->data_len + kbuf->header_len); +/* clear the old descriptor and set q to new one */ + k_desc &= ~VJ_HIGH_BIT; + clear_bit(k_desc, default_chan->used_descs); + ring->q[ring->c.head] = new_desc; + } + return 0; +} +EXPORT_SYMBOL(vj_need_local_buffer); + +struct vj_buffer *vj_get_buffer(int *desc_num) +{ + *desc_num = get_free_descriptor(default_chan); + + if (*desc_num == -1) { + printk("no free bits!\n"); + return NULL; + } + + return get_buffer(*desc_num, default_chan); +} +EXPORT_SYMBOL(vj_get_buffer); + +static void enqueue_buffer(struct vj_channel *chan, struct vj_buffer *buffer, int desc_num) +{ + u16 tail, nxt; + int i; + + pr_debug("*** in enqueue buffer\n"); + pr_debug(" desc_num = %i\n", desc_num); + pr_debug(" Buffer Data Length = %lu\n", buffer->data_len); + pr_debug(" Buffer Header Length = %lu\n", buffer->header_len); + pr_debug(" Buffer Data:\n"); + for (i = 0; i < buffer->data_len; i++) { + pr_debug("%i ", buffer->data[i]); + if (i % 20 == 0) + pr_debug("\n"); + } + pr_debug("\n"); + + tail = 
chan->ring->p.tail; + nxt = (tail + 1) % VJ_NET_CHANNEL_ENTRIES; + + pr_debug("nxt = %i and chan->c.head = %i\n", nxt, chan->ring->c.head); + if (nxt != chan->ring->c.head) { + chan->ring->q[tail] = desc_num; + + smp_wmb(); + chan->ring->p.tail=nxt; + pr_debug("chan->p.wakecnt = %i and chan->c.wakecnt = %i\n", chan->ring->p.wakecnt, chan->ring->c.wakecnt); + free_descs_for_channel(chan); + if (chan->ring->p.wakecnt != chan->ring->c.wakecnt) { + ++chan->ring->p.wakecnt; + /* consume whatever is available */ + pr_debug("WAKE UP, CONSUMER!!!\n\n"); + wake_up(&chan->wq); + } + } else //if can't add it to chan, may as well allow it to be reused + release_buffer(chan, desc_num); +} + +/* FIXME: If we're going to do wildcards here, we need to do ordering between different partial matches... */ +static struct vj_channel *find_channel(u32 saddr, u32 daddr, u16 proto, u16 sport, u16 dport, u32 ifindex) +{ + struct vj_channel *i; + + pr_debug("args saddr %u, daddr %u, sport %u, dport %u, ifindex %u, proto %u\n", saddr, daddr, sport, dport, ifindex, proto); + + list_for_each_entry(i, &channels, list) { + pr_debug("saddr %u, daddr %u, sport %u, dport %u, ifindex %u, proto %u\n", i->flowid.saddr, i->flowid.daddr, i->flowid.sport, i->flowid.dport, i->flowid.ifindex, i->flowid.proto); + + if ((!i->flowid.saddr || i->flowid.saddr == saddr) && + (!i->flowid.daddr || i->flowid.daddr == daddr) && + (!i->flowid.proto || i->flowid.proto == proto) && + (!i->flowid.sport || i->flowid.sport == sport) && + (!i->flowid.dport || i->flowid.dport == dport) && + (!i->flowid.ifindex || i->flowid.ifindex == ifindex)) { + pr_debug("Found channel %p\n", i); + return i; + } + } + pr_debug("using default channel %p\n", default_chan); + return default_chan; +} + +void vj_netif_rx(struct vj_buffer *buffer, int desc_num, + unsigned short proto) +{ + struct vj_channel *chan; + struct iphdr *ip; + int iphl, offset, real_data_len; + u16 *ports; + unsigned long flags; + + offset = sizeof(struct iphdr) + 
sizeof(struct udphdr); + real_data_len = buffer->data_len - offset; + + + pr_debug("data_len = %lu, offset = %i, real data? = %i\n\n\n", buffer->data_len, offset, real_data_len); + /* this is always 18 when there's 18 or less characters in buffer->data */ + + pr_debug("rx) desc_num = %i\n\n", desc_num); + + spin_lock_irqsave(&chan_lock, flags); + if (proto == __constant_htons(ETH_P_IP)) { + + ip = (struct iphdr *)(buffer->data + buffer->header_len); + ports = (u16 *)(ip + 1); + iphl = ip->ihl * 4; + + if ((buffer->data_len < (iphl + 4)) || + (iphl != sizeof(struct iphdr))) { + pr_debug("Bad data, default chan\n"); + pr_debug("buffer data_len = %li, header len = %li, ip->ihl = %i\n", buffer->data_len, buffer->header_len, ip->ihl); + chan = default_chan; + } else { + chan = find_channel(ip->saddr, ip->daddr, + ip->protocol, ports[0], + ports[1], buffer->ifindex); + + } + } else + chan = default_chan; + enqueue_buffer(chan, buffer, desc_num); + + spin_unlock_irqrestore(&chan_lock, flags); +} +EXPORT_SYMBOL(vj_netif_rx); + +/* + * Determine the packet's protocol ID. The rule here is that we + * assume 802.3 if the type field is short enough to be a length. + * This is normal practice and works for any 'now in use' protocol. + */ + +unsigned short eth_vj_type_trans(struct vj_buffer *buffer) +{ + struct ethhdr *eth; + unsigned char *rawp; + + eth = (struct ethhdr *)buffer->data; + buffer->header_len = ETH_HLEN; + + BUG_ON(buffer->header_len > buffer->data_len); + + buffer->data_len -= buffer->header_len; + if (ntohs(eth->h_proto) >= 1536) + return eth->h_proto; + + rawp = buffer->data; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. 
+ */ + if (*(unsigned short *)rawp == 0xFFFF) + return htons(ETH_P_802_3); + + /* + * Real 802.2 LLC + */ + return htons(ETH_P_802_2); +} +EXPORT_SYMBOL(eth_vj_type_trans); + +static void send_to_netif_rx(struct vj_buffer *buffer) +{ + struct sk_buff *skb; + struct net_device *dev; + int i; + + dev = dev_get_by_index(buffer->ifindex); + if (!dev) + return; + skb = dev_alloc_skb(buffer->data_len + 2); + if (skb == NULL) { + dev_put(dev); + return; + } + + skb_reserve(skb, 2); + skb->dev = dev; + + skb_put(skb, buffer->data_len); + memcpy(skb->data, buffer->data, buffer->data_len); + + pr_debug(" *** C buffer data_len = %lu and skb->len = %i\n", buffer->data_len, skb->len); + for (i = 0; i < 10; i++) + pr_debug("%i\n", skb->data[i]); + + skb->protocol = eth_type_trans(skb, skb->dev); + + netif_receive_skb(skb); +} + +/* handles default_chan (buffers that nobody else wants) */ +static int default_thread(void *unused) +{ + int consumed = 0; + int woken = 0; + struct vj_buffer *buffer; + wait_queue_t wait; + + /* When we get woken up, don't want to be removed from waitqueue! 
*/ +//no more wait.task struct task_struct * task is now void *private + wait.private = current; + wait.func = default_wake_function; + INIT_LIST_HEAD(&wait.task_list); + + add_wait_queue(&default_chan->wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + /* FIXME: if we do this before prepare_to_wait, avoids wmb */ + default_chan->ring->c.wakecnt++; + smp_wmb(); + + while (!is_empty(default_chan->ring)) { + smp_read_barrier_depends(); + buffer = get_buffer(default_chan->ring->q[default_chan->ring->c.head], default_chan); + pr_debug("calling send_to_netif_rx\n"); + send_to_netif_rx(buffer); + smp_rmb(); + default_chan->ring->c.head = (default_chan->ring->c.head+1)%VJ_NET_CHANNEL_ENTRIES; + consumed++; + } + + schedule(); + woken++; + set_current_state(TASK_INTERRUPTIBLE); + } + remove_wait_queue(&default_chan->wq, &wait); + + __set_current_state(TASK_RUNNING); + + pr_debug("consumer finished! consumed %i and woke %i\n", consumed, woken); + return 0; +} + +/* return the next buffer, but do not move on */ +struct vj_buffer *vj_peek_next_buffer(struct vj_channel *chan) +{ + struct vj_channel_ring *ring = chan->ring; + + if (is_empty(ring)) + return NULL; + return get_buffer(ring->q[ring->c.head], chan); +} +EXPORT_SYMBOL(vj_peek_next_buffer); + +/* move on to next buffer */ +void vj_done_with_buffer(struct vj_channel *chan) +{ + struct vj_channel_ring *ring = chan->ring; + + ring->c.head = (ring->c.head+1)%VJ_NET_CHANNEL_ENTRIES; + + pr_debug("done_with_buffer\n\n"); +} +EXPORT_SYMBOL(vj_done_with_buffer); + +struct vj_channel *vj_alloc_chan(int num_buffers) +{ + int i; + struct vj_channel *chan = kmalloc(sizeof(*chan), GFP_KERNEL); + + if (!chan) + return NULL; + + chan->ring = (void *)get_zeroed_page(GFP_KERNEL); + if (chan->ring == NULL) + goto free_chan; + + init_waitqueue_head(&chan->wq); + chan->ring->p.tail = chan->ring->p.wakecnt = chan->ring->p.old_head = chan->ring->c.head = chan->ring->c.wakecnt = 0; + + 
chan->num_local_buffers = num_buffers; + if (chan->num_local_buffers == 0) + return chan; + + chan->used_descs = kzalloc(BITS_TO_LONGS(chan->num_local_buffers) + * sizeof(long), GFP_KERNEL); + if (chan->used_descs == NULL) + goto free_ring; + chan->descs = kmalloc(sizeof(*chan->descs)*num_buffers, GFP_KERNEL); + if (chan->descs == NULL) + goto free_used_descs; + for (i = 0; i < chan->num_local_buffers; i++) { + chan->descs[i].buffer_len = PAGE_SIZE; + chan->descs[i].address = get_zeroed_page(GFP_KERNEL); + if (chan->descs[i].address == 0) + goto free_descs; + } + + return chan; + +free_descs: + for (--i; i >= 0; i--) + free_page(chan->descs[i].address); + kfree(chan->descs); +free_used_descs: + kfree(chan->used_descs); +free_ring: + free_page((unsigned long)chan->ring); +free_chan: + kfree(chan); + return NULL; +} +EXPORT_SYMBOL(vj_alloc_chan); + +void vj_register_chan(struct vj_channel *chan, const struct vj_flowid *flowid) +{ + pr_debug("%p %s: registering channel %p\n", + current, current->comm, chan); + chan->flowid = *flowid; + spin_lock_irq(&chan_lock); + list_add(&chan->list, &channels); + spin_unlock_irq(&chan_lock); +} +EXPORT_SYMBOL(vj_register_chan); + +void vj_unregister_chan(struct vj_channel *chan) +{ + pr_debug("%p %s: unregistering channel %p\n", + current, current->comm, chan); + spin_lock_irq(&chan_lock); + list_del(&chan->list); + spin_unlock_irq(&chan_lock); +} +EXPORT_SYMBOL(vj_unregister_chan); + +void vj_free_chan(struct vj_channel *chan) +{ + pr_debug("%p %s: freeing channel %p\n", + current, current->comm, chan); + /* FIXME: Mark any buffer still in channel as free! 
*/ + kfree(chan); +} +EXPORT_SYMBOL(vj_free_chan); + + + +/* not using at the mo - working on rx, not tx */ +int vj_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct vj_buffer *buffer; + /* first element in dev priv data must be addr of net_channel */ +// struct net_channel *chan = *(struct net_channel **) netdev_priv(dev) + 1; + int desc_num; + + buffer = vj_get_buffer(&desc_num); + buffer->data_len = skb->len; + memcpy(buffer->data, skb->data, buffer->data_len); +// enqueue_buffer(chan, buffer, desc_num); + + kfree(skb); + return 0; +} +EXPORT_SYMBOL(vj_xmit); + +static int __init init(void) +{ + default_chan = vj_alloc_chan(NUM_GLOBAL_DESCRIPTORS); + if (!default_chan) + return -ENOMEM; + + kthread_run(default_thread, NULL, "kvj_net"); + return 0; +} + +module_init(init); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VJ Channel Networking Module."); +MODULE_AUTHOR("Kelly Daly <[EMAIL PROTECTED]>"); - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html