diff --unified --recursive --new-file linux-2.6.15-rc6/include/linux/ring.h linux-2.6.15-rc6-1-686-smp-ring3/include/linux/ring.h
--- linux-2.6.15-rc6/include/linux/ring.h       1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/include/linux/ring.h       2005-12-24 00:24:01.000000000 +0100
@@ -0,0 +1,107 @@
+/*
+ * Definitions for packet ring
+ *
+ * 2004 - Luca Deri <deri@ntop.org>
+ */
+#ifndef __RING_H
+#define __RING_H
+
+
+#define INCLUDE_MAC_INFO
+
+#ifdef INCLUDE_MAC_INFO
+#define SKB_DISPLACEMENT    14 /* Include MAC address information */
+#else
+#define SKB_DISPLACEMENT    0  /* Do NOT include MAC address information */
+#endif
+
+#define RING_MAGIC
+#define RING_MAGIC_VALUE      0x88
+#define RING_FLOWSLOT_VERSION    6
+#define RING_VERSION          "3.1"
+
+#define SO_ADD_TO_CLUSTER        99
+#define SO_REMOVE_FROM_CLUSTER  100
+#define SO_SET_REFLECTOR        101
+
+/* *********************************** */
+
+#ifndef HAVE_PCAP
+struct pcap_pkthdr {
+  struct timeval ts;    /* time stamp */
+  u_int32_t caplen;     /* length of portion present */
+  u_int32_t len;        /* length this packet (off wire) */
+};
+#endif
+
+/* *********************************** */
+
+enum cluster_type {
+  cluster_per_flow = 0,
+  cluster_round_robin
+};
+
+/* *********************************** */
+
+#define RING_MIN_SLOT_SIZE    (60+sizeof(struct pcap_pkthdr))
+#define RING_MAX_SLOT_SIZE    (1514+sizeof(struct pcap_pkthdr))
+
+/* *********************************** */
+
+typedef struct flowSlotInfo {
+  u_int16_t version, sample_rate;
+  u_int32_t tot_slots, slot_len, data_len, tot_mem;
+
+  u_int64_t tot_pkts, tot_lost;
+  u_int64_t tot_insert, tot_read;
+  u_int32_t insert_idx, remove_idx;
+} FlowSlotInfo;
+
+/* *********************************** */
+
+typedef struct flowSlot {
+#ifdef RING_MAGIC
+  u_char     magic;      /* It must always be RING_MAGIC_VALUE */
+#endif
+  u_char     slot_state; /* 0=empty, 1=full   */
+  u_char     bucket;     /* bucket[bucketLen] */
+} FlowSlot;
+
+/* *********************************** */
+
+#ifdef __KERNEL__
+
+FlowSlotInfo* getRingPtr(void);
+int allocateRing(char *deviceName, u_int numSlots,
+                u_int bucketLen, u_int sampleRate);
+unsigned int pollRing(struct file *fp, struct poll_table_struct * wait);
+void deallocateRing(void);
+
+/* ************************* */
+
+typedef int (*handle_ring_skb)(struct sk_buff *skb,
+                              u_char recv_packet, u_char real_skb);
+extern handle_ring_skb get_skb_ring_handler(void);
+extern void set_skb_ring_handler(handle_ring_skb the_handler);
+extern void do_skb_ring_handler(struct sk_buff *skb,
+                               u_char recv_packet, u_char real_skb);
+
+typedef int (*handle_ring_buffer)(struct net_device *dev,
+                                    char *data, int len);
+extern handle_ring_buffer get_buffer_ring_handler(void);
+extern void set_buffer_ring_handler(handle_ring_buffer the_handler);
+extern int do_buffer_ring_handler(struct net_device *dev,
+                                 char *data, int len);
+#endif /* __KERNEL__  */
+
+/* *********************************** */
+
+#define PF_RING          27      /* Packet Ring */
+#define SOCK_RING        PF_RING
+
+/* ioctl() */
+#define SIORINGPOLL      0x8888
+
+/* *********************************** */
+
+#endif /* __RING_H */
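
Taken together, the definitions above are the whole userspace contract: open a PF_RING socket, bind() it to a device, mmap() the ring, and read FlowSlotInfo/FlowSlot straight out of the shared memory. A minimal sketch of the open/bind/mmap sequence follows (not part of the patch; the device name and the one-page probe mapping are illustrative, and a real client would remap the full info->tot_mem after reading the geometry):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/ring.h>

int main(void) {
  /* ring_create() only accepts SOCK_RAW with protocol htons(ETH_P_ALL) */
  int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));
  if(fd < 0) { perror("socket(PF_RING)"); return 1; }

  /* ring_bind() reads the device name from sa_data ("eth0" is an example) */
  struct sockaddr sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_family = PF_RING;
  strncpy(sa.sa_data, "eth0", sizeof(sa.sa_data) - 1);
  if(bind(fd, &sa, sizeof(sa)) < 0) { perror("bind"); return 1; }

  /* ring_mmap() requires a page-aligned length; FlowSlotInfo sits at the
     start of the mapping, so one page is enough to read the geometry */
  FlowSlotInfo *info = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if(info == MAP_FAILED) { perror("mmap"); return 1; }

  printf("ring: %u slots of %u bytes (%u bytes total)\n",
         info->tot_slots, info->slot_len, info->tot_mem);
  close(fd);
  return 0;
}

The SO_* options defined above (SO_ADD_TO_CLUSTER, SO_REMOVE_FROM_CLUSTER, SO_SET_REFLECTOR) are driven through plain setsockopt() calls on the same descriptor.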
diff --unified --recursive --new-file linux-2.6.15-rc6/net/core/dev.c linux-2.6.15-rc6-1-686-smp-ring3/net/core/dev.c
--- linux-2.6.15-rc6/net/core/dev.c     2005-12-19 01:36:54.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/core/dev.c     2005-12-24 00:24:02.000000000 +0100
@@ -115,6 +115,56 @@
 #endif /* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
+
+/* #define RING_DEBUG */
+
+#include <linux/ring.h>
+#include <linux/version.h>
+
+static handle_ring_skb ring_handler = NULL;
+
+handle_ring_skb get_skb_ring_handler() { return(ring_handler); }
+
+void set_skb_ring_handler(handle_ring_skb the_handler) {
+  ring_handler = the_handler;
+}
+
+void do_skb_ring_handler(struct sk_buff *skb,
+                        u_char recv_packet, u_char real_skb) {
+  if(ring_handler)
+    ring_handler(skb, recv_packet, real_skb);
+}
+
+/* ******************* */
+
+static handle_ring_buffer buffer_ring_handler = NULL;
+
+handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); }
+
+void set_buffer_ring_handler(handle_ring_buffer the_handler) {
+  buffer_ring_handler = the_handler;
+}
+
+int do_buffer_ring_handler(struct net_device *dev, char *data, int len) {
+  if(buffer_ring_handler) {
+    buffer_ring_handler(dev, data, len);
+    return(1);
+  } else
+    return(0);
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+EXPORT_SYMBOL(get_skb_ring_handler);
+EXPORT_SYMBOL(set_skb_ring_handler);
+EXPORT_SYMBOL(do_skb_ring_handler);
+
+EXPORT_SYMBOL(get_buffer_ring_handler);
+EXPORT_SYMBOL(set_buffer_ring_handler);
+EXPORT_SYMBOL(do_buffer_ring_handler);
+#endif
+
+#endif
 /*
  *     The list of packet types we will receive (as opposed to discard)
  *     and the routines to invoke.
@@ -1296,6 +1346,10 @@
        skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
 #endif
        if (q->enqueue) {
+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
+       if(ring_handler) ring_handler(skb, 0, 1);
+#endif /* CONFIG_RING */
+
                /* Grab device queue */
                spin_lock(&dev->queue_lock);
 
@@ -1437,6 +1491,13 @@
 
        preempt_disable();
        err = netif_rx(skb);
+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
+       if(ring_handler && ring_handler(skb, 1, 1)) {
+         /* The packet has been copied into a ring */
+         return(NET_RX_SUCCESS);
+       }
+#endif /* CONFIG_RING */
+
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();
@@ -1582,6 +1643,13 @@
        struct net_device *orig_dev;
        int ret = NET_RX_DROP;
        unsigned short type;
+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
+       if(ring_handler && ring_handler(skb, 1, 1)) {
+         /* The packet has been copied into a ring */
+         return(NET_RX_SUCCESS);
+       }
+#endif /* CONFIG_RING */
+
 
        /* if we've gotten here through NAPI, check netpoll */
        if (skb->dev->poll && netpoll_rx(skb))
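
The first hunk above publishes two hook pairs: the skb handlers, invoked from netif_rx()/dev_queue_xmit() in the remaining hunks, and the buffer handlers for drivers that hand over raw frames without building an skb (the "e1000 wrapper" case referenced in ring_packet.c). A sketch of what such a driver-side call site could look like; the function and variable names are hypothetical:

#include <linux/netdevice.h>
#include <linux/ring.h>

/* Hypothetical RX path of a driver that polls frames into a flat buffer */
static void example_rx(struct net_device *dev, char *frame, int frame_len)
{
        /* do_buffer_ring_handler() returns 1 when a ring handler is
           registered; the frame was then wrapped into a fake skb and
           offered to all matching rings */
        if (do_buffer_ring_handler(dev, frame, frame_len))
                return;

        /* ... otherwise fall through to the normal netif_rx() path ... */
}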
diff --unified --recursive --new-file linux-2.6.15-rc6/net/Kconfig linux-2.6.15-rc6-1-686-smp-ring3/net/Kconfig
--- linux-2.6.15-rc6/net/Kconfig        2005-12-19 01:36:54.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/Kconfig        2005-12-24 00:24:02.000000000 +0100
@@ -31,6 +31,7 @@
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 
+source "net/ring/Kconfig"
 config INET
        bool "TCP/IP networking"
        ---help---
diff --unified --recursive --new-file linux-2.6.15-rc6/net/Makefile linux-2.6.15-rc6-1-686-smp-ring3/net/Makefile
--- linux-2.6.15-rc6/net/Makefile       2005-12-19 01:36:54.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/Makefile       2005-12-24 00:24:02.000000000 +0100
@@ -42,6 +42,7 @@
 obj-$(CONFIG_DECNET)           += decnet/
 obj-$(CONFIG_ECONET)           += econet/
 obj-$(CONFIG_VLAN_8021Q)       += 8021q/
+obj-$(CONFIG_RING)             += ring/
 obj-$(CONFIG_IP_DCCP)          += dccp/
 obj-$(CONFIG_IP_SCTP)          += sctp/
 obj-$(CONFIG_IEEE80211)                += ieee80211/
diff --unified --recursive --new-file linux-2.6.15-rc6/net/ring/Kconfig linux-2.6.15-rc6-1-686-smp-ring3/net/ring/Kconfig
--- linux-2.6.15-rc6/net/ring/Kconfig   1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/ring/Kconfig   2005-12-24 00:24:02.000000000 +0100
@@ -0,0 +1,14 @@
+config RING
+       tristate "PF_RING sockets (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       ---help---
+         PF_RING socket family, optimized for packet capture.
+         If a PF_RING socket is bound to an adapter (via the bind() system
+         call), that adapter will be used in read-only mode until the socket
+         is destroyed. Whenever an incoming packet is received from the
+         adapter, it is not passed to the upper layers; instead it is copied
+         to a ring buffer, which in turn is exported to user space via mmap.
+         Please refer to http://luca.ntop.org/Ring.pdf for more information.
+
+         Say N unless you know what you are doing.
+
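With the glue lines added to net/Kconfig and net/Makefile above, the option appears in the top-level networking menu once CONFIG_EXPERIMENTAL is set; a modular build corresponds to the following .config fragment:

CONFIG_EXPERIMENTAL=y
CONFIG_RING=m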
diff --unified --recursive --new-file linux-2.6.15-rc6/net/ring/Makefile linux-2.6.15-rc6-1-686-smp-ring3/net/ring/Makefile
--- linux-2.6.15-rc6/net/ring/Makefile  1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/ring/Makefile  2005-12-24 00:24:02.000000000 +0100
@@ -0,0 +1,7 @@
+#
+# Makefile for the ring driver.
+#
+
+obj-m += ring.o
+
+ring-objs := ring_packet.o
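
Note that `obj-m += ring.o` builds ring.ko unconditionally as a module (even a CONFIG_RING=y selection still produces a module), linked from the single object ring_packet.o. The same Makefile also supports the usual out-of-tree kbuild invocation against a matching 2.6 source tree, for example (paths and parameter values are illustrative):

make -C /lib/modules/$(uname -r)/build M=$PWD modules
insmod ./ring.ko num_slots=4096 bucket_len=128 transparent_mode=1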
diff --unified --recursive --new-file linux-2.6.15-rc6/net/ring/ring_packet.c linux-2.6.15-rc6-1-686-smp-ring3/net/ring/ring_packet.c
--- linux-2.6.15-rc6/net/ring/ring_packet.c     1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.15-rc6-1-686-smp-ring3/net/ring/ring_packet.c     2005-12-24 00:24:02.000000000 +0100
@@ -0,0 +1,1568 @@
+/*
+ *
+ * (C) 2004-05 - Luca Deri <deri@ntop.org>
+ *
+ * This code includes contributions courtesy of
+ * - Jeff Randall <jrandall@nexvu.com>
+ * - Helmut Manck <helmut.manck@secunet.com>
+ * - Brad Doctor <brad@stillsecure.com>
+ * - Amit D. Chaudhary <amit_ml@rajgad.com>
+ *
+ */
+
+/*
+   TO DO:
+   add an entry inside the /proc filesystem
+*/
+
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/init.h>
+#include <linux/filter.h>
+#include <linux/ring.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/list.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+#include <net/xfrm.h>
+#else
+#include <linux/poll.h>
+#endif
+#include <net/sock.h>
+#include <asm/io.h>   /* needed for virt_to_phys() */
+
+/* #define RING_DEBUG */
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
+static inline int remap_page_range(struct vm_area_struct *vma,
+                                  unsigned long uvaddr,
+                                  unsigned long paddr,
+                                  unsigned long size,
+                                  pgprot_t prot) {
+  return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT,
+                        size, prot));
+}
+#endif
+
+/* ************************************************* */
+
+#define CLUSTER_LEN       8
+
+struct ring_cluster {
+  u_short             cluster_id; /* 0 = no cluster */
+  u_short             num_cluster_elements;
+  enum cluster_type   hashing_mode;
+  u_short             hashing_id;
+  struct sock         *sk[CLUSTER_LEN];
+  struct ring_cluster *next;      /* NULL = last element of the cluster */
+};
+
+/* ************************************************* */
+
+struct ring_element {
+  struct list_head  list;
+  struct sock      *sk;
+};
+
+/* ************************************************* */
+
+struct ring_opt {
+  struct net_device *ring_netdev;
+
+  /* Cluster */
+  u_short cluster_id; /* 0 = no cluster */
+
+  /* Reflector */
+  struct net_device *reflector_dev;
+
+  /* Packet buffers */
+  unsigned long order;
+
+  /* Ring Slots */
+  unsigned long ring_memory;
+  FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
+  char *ring_slots;  /* Basically it points to ring_memory
+                       +sizeof(FlowSlotInfo) */
+
+  /* Packet Sampling */
+  u_int pktToSample, sample_rate;
+
+  /* BPF Filter */
+  struct sk_filter *bpfFilter;
+
+  /* Locks */
+  atomic_t num_ring_slots_waiters;
+  wait_queue_head_t ring_slots_waitqueue;
+  rwlock_t ring_index_lock;
+
+  /* Indexes (Internal) */
+  u_int insert_page_id, insert_slot_id;
+};
+
+/* ************************************************* */
+
+/* List of all ring sockets. */
+static struct list_head ring_table;
+
+/* List of all clusters */
+static struct ring_cluster *ring_cluster_list;
+
+static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED;
+
+/* ********************************** */
+
+/* Forward */
+static struct proto_ops ring_ops;
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
+static struct proto ring_proto;
+#endif
+
+static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
+                           u_char real_skb);
+static int buffer_ring_handler(struct net_device *dev, char *data, int len);
+static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
+
+/* Extern */
+
+/* ********************************** */
+
+/* Defaults */
+static u_int bucket_len = 128, num_slots = 4096, sample_rate = 1,
+  transparent_mode = 1, enable_tx_capture = 0;
+
+MODULE_PARM(bucket_len, "i");
+MODULE_PARM_DESC(bucket_len, "Size (in bytes) of each ring bucket");
+MODULE_PARM(num_slots,  "i");
+MODULE_PARM_DESC(num_slots,  "Number of ring slots");
+MODULE_PARM(sample_rate, "i");
+MODULE_PARM_DESC(sample_rate, "Ring packet sample rate");
+MODULE_PARM(transparent_mode, "i");
+MODULE_PARM_DESC(transparent_mode,
+                "Set to 1 to set transparent mode "
+                "(slower but backwards compatible)");
+MODULE_PARM(enable_tx_capture, "i");
+MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
+
+/* ********************************** */
+
+#define MIN_QUEUED_PKTS      64
+#define MAX_QUEUE_LOOPS      64
+
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
+#define ring_sk(__sk) ((__sk)->sk_protinfo)
+#else
+#define ring_sk_datatype(a) (a)
+#define ring_sk(__sk) ((__sk)->protinfo.pf_ring)
+#endif
+
+#define _rdtsc() ({ uint64_t x; asm volatile("rdtsc" : "=A" (x)); x; })
+
+/*
+  int dev_queue_xmit(struct sk_buff *skb)
+  skb->dev;
+  struct net_device *dev_get_by_name(const char *name)
+*/
+
+/* ********************************** */
+
+static void ring_sock_destruct(struct sock *sk) {
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+  skb_queue_purge(&sk->sk_receive_queue);
+
+  if (!sock_flag(sk, SOCK_DEAD)) {
+#if defined(RING_DEBUG)
+    printk("Attempt to release alive ring socket: %p\n", sk);
+#endif
+    return;
+  }
+
+  BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+  BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+#else
+
+  BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
+  BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
+
+  if (!sk->dead) {
+#if defined(RING_DEBUG)
+    printk("Attempt to release alive ring socket: %p\n", sk);
+#endif
+    return;
+  }
+#endif
+
+  kfree(ring_sk(sk));
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+  MOD_DEC_USE_COUNT;
+#endif
+}
+
+/* ********************************** */
+/*
+ * ring_insert()
+ *
+ * store the sk in a new element and add it
+ * to the head of the list.
+ */
+static inline void ring_insert(struct sock *sk) {
+  struct ring_element *next;
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_insert()\n");
+#endif
+
+  next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
+  if(next != NULL) {
+    next->sk = sk;
+    write_lock_irq(&ring_mgmt_lock);
+    list_add(&next->list, &ring_table);
+    write_unlock_irq(&ring_mgmt_lock);
+  } else {
+    if (net_ratelimit())
+      printk("RING: could not kmalloc slot!!\n");
+  }
+}
+
+/* ********************************** */
+/*
+ * ring_remove()
+ *
+ * For each of the elements in the list:
+ *  - check if this is the element we want to delete
+ *  - if it is, remove it from the list, and free it.
+ *
+ * stop when we find the one we're looking for (break),
+ * or when we reach the end of the list.
+ */
+static inline void ring_remove(struct sock *sk) {
+  struct list_head *ptr;
+  struct ring_element *entry;
+
+
+  for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
+    entry = list_entry(ptr, struct ring_element, list);
+
+    if(entry->sk == sk) {
+      write_lock_irq(&ring_mgmt_lock);
+      list_del(ptr);
+      kfree(ptr);
+      write_unlock_irq(&ring_mgmt_lock);
+      break;
+    }
+  }
+
+}
+
+/* ********************************** */
+
+static u_int32_t num_queued_pkts(struct ring_opt *pfr) {
+
+  if(pfr->ring_slots != NULL) {
+
+    u_int32_t tot_insert = pfr->slots_info->insert_idx,
+#if defined(RING_DEBUG)
+      tot_read = pfr->slots_info->tot_read, tot_pkts;
+#else
+    tot_read = pfr->slots_info->tot_read;
+#endif
+
+    if(tot_insert >= tot_read) {
+#if defined(RING_DEBUG)
+      tot_pkts = tot_insert-tot_read;
+#endif
+      return(tot_insert-tot_read);
+    } else {
+#if defined(RING_DEBUG)
+      tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read;
+#endif
+      return(((u_int32_t)-1)+tot_insert-tot_read);
+    }
+
+#if defined(RING_DEBUG)
+    printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n",
+          tot_pkts, tot_insert, tot_read);
+#endif
+
+  } else
+    return(0);
+}
+
+/* ********************************** */
+
+static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) {
+#if defined(RING_DEBUG)
+  printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx);
+#endif
+
+  if(pfr->ring_slots != NULL) {
+    FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx
+                                                 *pfr->slots_info->slot_len]);
+    return(slot);
+  } else
+    return(NULL);
+}
+
+/* ********************************** */
+
+static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) {
+#if defined(RING_DEBUG)
+  printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx);
+#endif
+
+  if(pfr->ring_slots != NULL)
+    return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx*
+                                       pfr->slots_info->slot_len]));
+  else
+    return(NULL);
+}
+
+/* ********************************** */
+
+static void add_skb_to_ring(struct sk_buff *skb,
+                           struct ring_opt *pfr,
+                           u_char recv_packet,
+                           u_char real_skb /* 1=skb 0=faked skb */) {
+  FlowSlot *theSlot;
+  int idx, displ;
+
+  if(recv_packet) {
+    /* Hack for identifying a packet received by the e1000 */
+    if(real_skb) {
+      displ = SKB_DISPLACEMENT;
+    } else
+      displ = 0; /* Received by the e1000 wrapper */
+  } else
+    displ = 0;
+
+  write_lock(&pfr->ring_index_lock);
+  pfr->slots_info->tot_pkts++;
+  write_unlock(&pfr->ring_index_lock);
+
+  /* BPF Filtering (from af_packet.c) */
+  if(pfr->bpfFilter != NULL) {
+    unsigned res = 1, len;
+
+    len = skb->len-skb->data_len;
+
+    write_lock(&pfr->ring_index_lock);
+    skb->data -= displ;
+    res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len);
+    skb->data += displ;
+    write_unlock(&pfr->ring_index_lock);
+
+    if(res == 0) {
+      /* Filter failed */
+
+#if defined(RING_DEBUG)
+      printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
+            "[insertIdx=%d][pkt_type=%d][cloned=%d]\n",
+            (int)skb->len, pfr->slots_info->tot_pkts,
+            pfr->slots_info->insert_idx,
+            skb->pkt_type, skb->cloned);
+#endif
+
+      return;
+    }
+  }
+
+  /* ************************** */
+
+  if(pfr->sample_rate > 1) {
+    if(pfr->pktToSample == 0) {
+      write_lock(&pfr->ring_index_lock);
+      pfr->pktToSample = pfr->sample_rate;
+      write_unlock(&pfr->ring_index_lock);
+    } else {
+      write_lock(&pfr->ring_index_lock);
+      pfr->pktToSample--;
+      write_unlock(&pfr->ring_index_lock);
+
+#if defined(RING_DEBUG)
+      printk("add_skb_to_ring(skb): sampled packet [len=%d]"
+            "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n",
+            (int)skb->len, pfr->slots_info->tot_pkts,
+            pfr->slots_info->insert_idx,
+            skb->pkt_type, skb->cloned);
+#endif
+      return;
+    }
+  }
+
+  /* ************************************* */
+
+  if((pfr->reflector_dev != NULL)
+     && (!netif_queue_stopped(pfr->reflector_dev))) {
+    int cpu = smp_processor_id();
+
+    /* increase reference counter so that this skb is not freed */
+    atomic_inc(&skb->users);
+
+    skb->data -= displ;
+
+    /* send it */
+    if (pfr->reflector_dev->xmit_lock_owner != cpu) {
+      spin_lock_bh(&pfr->reflector_dev->xmit_lock);
+      pfr->reflector_dev->xmit_lock_owner = cpu;
+      spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
+
+      if (pfr->reflector_dev->hard_start_xmit(skb,
+                                             pfr->reflector_dev) == 0) {
+        spin_lock_bh(&pfr->reflector_dev->xmit_lock);
+       pfr->reflector_dev->xmit_lock_owner = -1;
+       skb->data += displ;
+       spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
+#if defined(RING_DEBUG)
+       printk("++ hard_start_xmit succeeded\n");
+#endif
+       return; /* OK */
+      }
+
+      spin_lock_bh(&pfr->reflector_dev->xmit_lock);
+      pfr->reflector_dev->xmit_lock_owner = -1;
+      spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
+    }
+
+#if defined(RING_DEBUG)
+    printk("++ hard_start_xmit failed\n");
+#endif
+    skb->data += displ;
+    return; /* -ENETDOWN */
+  }
+
+  /* ************************************* */
+
+#if defined(RING_DEBUG)
+  printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]"
+        "[pkt_type=%d][cloned=%d]\n",
+        (int)skb->len, pfr->slots_info->tot_pkts,
+        pfr->slots_info->insert_idx,
+        skb->pkt_type, skb->cloned);
+#endif
+
+  idx = pfr->slots_info->insert_idx;
+  theSlot = get_insert_slot(pfr);
+
+  if((theSlot != NULL) && (theSlot->slot_state == 0)) {
+    struct pcap_pkthdr *hdr;
+    char *bucket;
+
+    /* Update Index */
+    idx++;
+
+    if(idx == pfr->slots_info->tot_slots) {
+      write_lock(&pfr->ring_index_lock);
+      pfr->slots_info->insert_idx = 0;
+      write_unlock(&pfr->ring_index_lock);
+    } else {
+      write_lock(&pfr->ring_index_lock);
+      pfr->slots_info->insert_idx = idx;
+      write_unlock(&pfr->ring_index_lock);
+    }
+
+    bucket = &theSlot->bucket;
+    hdr = (struct pcap_pkthdr*)bucket;
+
+    /* BD - API changed for time keeping */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
+    if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp);
+
+    hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec;
+#else
+    if(skb->tstamp.off_sec == 0) __net_timestamp(skb);
+
+    hdr->ts.tv_sec = skb->tstamp.off_sec, hdr->ts.tv_usec = skb->tstamp.off_usec;
+#endif
+
+    hdr->caplen    = skb->len+displ;
+
+    if(hdr->caplen > pfr->slots_info->data_len)
+      hdr->caplen = pfr->slots_info->data_len;
+
+    hdr->len = skb->len+displ;
+    memcpy(&bucket[sizeof(struct pcap_pkthdr)],
+          skb->data-displ, hdr->caplen);
+
+#if defined(RING_DEBUG)
+    {
+      static unsigned int lastLoss = 0;
+
+      if(pfr->slots_info->tot_lost
+        && (lastLoss != pfr->slots_info->tot_lost)) {
+       printk("add_skb_to_ring(%d): [data_len=%d]"
+              "[hdr.caplen=%d][skb->len=%d]"
+              "[pcap_pkthdr=%d][removeIdx=%d]"
+              "[loss=%lu][page=%u][slot=%u]\n",
+              idx-1, pfr->slots_info->data_len, hdr->caplen, skb->len,
+              sizeof(struct pcap_pkthdr),
+              pfr->slots_info->remove_idx,
+              (long unsigned int)pfr->slots_info->tot_lost,
+              pfr->insert_page_id, pfr->insert_slot_id);
+
+       lastLoss = pfr->slots_info->tot_lost;
+      }
+    }
+#endif
+
+    write_lock(&pfr->ring_index_lock);
+    pfr->slots_info->tot_insert++;
+    theSlot->slot_state = 1;
+    write_unlock(&pfr->ring_index_lock);
+  } else {
+    write_lock(&pfr->ring_index_lock);
+    pfr->slots_info->tot_lost++;
+    write_unlock(&pfr->ring_index_lock);
+
+#if defined(RING_DEBUG)
+    printk("add_skb_to_ring(skb): packet lost [loss=%lu]"
+          "[removeIdx=%u][insertIdx=%u]\n",
+          (long unsigned int)pfr->slots_info->tot_lost,
+          pfr->slots_info->remove_idx, pfr->slots_info->insert_idx);
+#endif
+  }
+
+  /* wakeup in case of poll() */
+  if(waitqueue_active(&pfr->ring_slots_waitqueue))
+    wake_up_interruptible(&pfr->ring_slots_waitqueue);
+}
+
+/* ********************************** */
+
+static u_int hash_skb(struct ring_cluster *cluster_ptr,
+                     struct sk_buff *skb, u_char recv_packet) {
+  u_int idx;
+  int displ;
+  struct iphdr *ip;
+
+  if(cluster_ptr->hashing_mode == cluster_round_robin) {
+    idx = cluster_ptr->hashing_id++;
+  } else {
+    /* Per-flow clustering */
+    if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) {
+      if(recv_packet)
+       displ = 0;
+      else
+       displ = SKB_DISPLACEMENT;
+
+      /*
+       skb->data+displ
+
+       Always points to the IP part of the packet
+      */
+
+      ip = (struct iphdr*)(skb->data+displ);
+
+      idx = ip->saddr+ip->daddr+ip->protocol;
+
+      if(ip->protocol == IPPROTO_TCP) {
+       struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ
+                                             +sizeof(struct iphdr));
+       idx += tcp->source+tcp->dest;
+      } else if(ip->protocol == IPPROTO_UDP) {
+       struct udphdr *udp = (struct udphdr*)(skb->data+displ
+                                             +sizeof(struct iphdr));
+       idx += udp->source+udp->dest;
+      }
+    } else
+      idx = skb->len;
+  }
+
+  return(idx % cluster_ptr->num_cluster_elements);
+}
+
+/* ********************************** */
+
+static int skb_ring_handler(struct sk_buff *skb,
+                           u_char recv_packet,
+                           u_char real_skb /* 1=skb 0=faked skb */) {
+  struct sock *skElement;
+  int rc = 0;
+  struct list_head *ptr;
+  struct ring_cluster *cluster_ptr;
+
+#ifdef PROFILING
+  uint64_t rdt = _rdtsc(), rdt1, rdt2;
+#endif
+
+  if((!skb) /* Invalid skb */
+     || ((!enable_tx_capture) && (!recv_packet))) {
+    /*
+      An outgoing packet is about to be sent out
+      but we decided not to handle transmitted
+      packets.
+    */
+    return(0);
+  }
+
+#if defined(RING_DEBUG)
+  if(0) {
+    printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len,
+          skb->dev->name == NULL ? "<NULL>" : skb->dev->name);
+  }
+#endif
+
+#ifdef PROFILING
+  rdt1 = _rdtsc();
+#endif
+
+  /* [1] Check unclustered sockets */
+  for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
+    struct ring_opt *pfr;
+    struct ring_element *entry;
+
+    entry = list_entry(ptr, struct ring_element, list);
+
+    read_lock(&ring_mgmt_lock);
+    skElement = entry->sk;
+    pfr = ring_sk(skElement);
+    read_unlock(&ring_mgmt_lock);
+
+    if((pfr != NULL)
+       && (pfr->cluster_id == 0 /* No cluster */)
+       && (pfr->ring_slots != NULL)
+       && (pfr->ring_netdev == skb->dev)) {
+      /* We've found the ring where the packet can be stored */
+      read_lock(&ring_mgmt_lock);
+      add_skb_to_ring(skb, pfr, recv_packet, real_skb);
+      read_unlock(&ring_mgmt_lock);
+
+      rc = 1; /* Ring found: we've done our job */
+    }
+  }
+
+  /* [2] Check socket clusters */
+  cluster_ptr = ring_cluster_list;
+
+  while(cluster_ptr != NULL) {
+    struct ring_opt *pfr;
+
+    if(cluster_ptr->num_cluster_elements > 0) {
+      u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet);
+
+      read_lock(&ring_mgmt_lock);
+      skElement = cluster_ptr->sk[skb_hash];
+      read_unlock(&ring_mgmt_lock);
+
+      if(skElement != NULL) {
+       pfr = ring_sk(skElement);
+
+       if((pfr != NULL)
+          && (pfr->ring_slots != NULL)
+          && (pfr->ring_netdev == skb->dev)) {
+         /* We've found the ring where the packet can be stored */
+          read_lock(&ring_mgmt_lock);
+         add_skb_to_ring(skb, pfr, recv_packet, real_skb);
+          read_unlock(&ring_mgmt_lock);
+
+         rc = 1; /* Ring found: we've done our job */
+       }
+      }
+    }
+
+    cluster_ptr = cluster_ptr->next;
+  }
+
+#ifdef PROFILING
+  rdt1 = _rdtsc()-rdt1;
+#endif
+
+#ifdef PROFILING
+  rdt2 = _rdtsc();
+#endif
+
+  if(transparent_mode) rc = 0;
+
+  if((rc != 0) && real_skb)
+    dev_kfree_skb(skb); /* Free the skb */
+
+#ifdef PROFILING
+  rdt2 = _rdtsc()-rdt2;
+  rdt = _rdtsc()-rdt;
+
+#if defined(RING_DEBUG)
+  printk("# cycles: %d [lock cost %d %d%%][free cost %d %d%%]\n",
+        (int)rdt, rdt-rdt1,
+        (int)((float)((rdt-rdt1)*100)/(float)rdt),
+        rdt2,
+        (int)((float)(rdt2*100)/(float)rdt));
+#endif
+#endif
+
+  return(rc); /*  0 = packet not handled */
+}
+
+/* ********************************** */
+
+struct sk_buff skb;
+
+static int buffer_ring_handler(struct net_device *dev,
+                              char *data, int len) {
+
+#if defined(RING_DEBUG)
+  printk("buffer_ring_handler: [dev=%s][len=%d]\n",
+        dev->name == NULL ? "<NULL>" : dev->name, len);
+#endif
+
+  /* BD - API changed for time keeping */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
+  skb.dev = dev, skb.len = len, skb.data = data,
+    skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */
+#else
+  skb.dev = dev, skb.len = len, skb.data = data,
+    skb.data_len = len, skb.tstamp.off_sec = 0; /* Calculate the time */
+#endif
+
+  skb_ring_handler(&skb, 1, 0 /* fake skb */);
+
+  return(0);
+}
+
+/* ********************************** */
+
+static int ring_create(struct socket *sock, int protocol) {
+  struct sock *sk;
+  struct ring_opt *pfr;
+  int err;
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_create()\n");
+#endif
+
+  /* Is the caller privileged (CAP_NET_ADMIN)? */
+  if(!capable(CAP_NET_ADMIN))
+    return -EPERM;
+
+  if(sock->type != SOCK_RAW)
+    return -ESOCKTNOSUPPORT;
+
+  if(protocol != htons(ETH_P_ALL))
+    return -EPROTONOSUPPORT;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+  MOD_INC_USE_COUNT;
+#endif
+
+  err = -ENOMEM;
+
+  // BD: broke this out to keep it simple and clear as to what the
+  // options are.
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
+  sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
+#else
+  // BD: API changed in 2.6.12, ref:
+  // http://svn.clkao.org/svnweb/linux/revision/?rev=28201
+  sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
+#endif
+#else
+  /* Kernel 2.4 */
+  sk = sk_alloc(PF_RING, GFP_KERNEL, 1);
+#endif
+
+  if (sk == NULL)
+    goto out;
+
+  sock->ops = &ring_ops;
+  sock_init_data(sock, sk);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
+  sk_set_owner(sk, THIS_MODULE);
+#endif
+#endif
+
+  err = -ENOMEM;
+  ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
+
+  if (!(pfr = ring_sk(sk))) {
+    sk_free(sk);
+    goto out;
+  }
+  memset(pfr, 0, sizeof(*pfr));
+  init_waitqueue_head(&pfr->ring_slots_waitqueue);
+  pfr->ring_index_lock = RW_LOCK_UNLOCKED;
+  atomic_set(&pfr->num_ring_slots_waiters, 0);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+  sk->sk_family       = PF_RING;
+  sk->sk_destruct     = ring_sock_destruct;
+#else
+  sk->family          = PF_RING;
+  sk->destruct        = ring_sock_destruct;
+  sk->num             = protocol;
+#endif
+
+  ring_insert(sk);
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_create() - created\n");
+#endif
+
+  return(0);
+ out:
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+  MOD_DEC_USE_COUNT;
+#endif
+  return err;
+}
+
+/* *********************************************** */
+
+static int ring_release(struct socket *sock)
+{
+  struct sock *sk = sock->sk;
+  struct ring_opt *pfr = ring_sk(sk);
+
+  if(!sk)
+    return 0;
+
+#if defined(RING_DEBUG)
+  printk("RING: called ring_release\n");
+#endif
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_release entered\n");
+#endif
+
+  ring_remove(sk);
+
+  sock_orphan(sk);
+  sock->sk = NULL;
+
+  /* Free the ring buffer */
+  if(pfr->ring_memory) {
+    struct page *page, *page_end;
+
+    page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
+    for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
+      ClearPageReserved(page);
+
+    free_pages(pfr->ring_memory, pfr->order);
+  }
+
+  kfree(pfr);
+  ring_sk(sk) = NULL;
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+  skb_queue_purge(&sk->sk_write_queue);
+#endif
+  sock_put(sk);
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_release leaving\n");
+#endif
+
+  return 0;
+}
+
+/* ********************************** */
+/*
+ * We create a ring for this socket and bind it to the specified device
+ */
+static int packet_ring_bind(struct sock *sk, struct net_device *dev)
+{
+  u_int the_slot_len;
+  u_int32_t tot_mem;
+  struct ring_opt *pfr = ring_sk(sk);
+  struct page *page, *page_end;
+
+  if(!dev) return(-1);
+
+#if defined(RING_DEBUG)
+  printk("RING: packet_ring_bind(%s) called\n", dev->name);
+#endif
+
+  /* **********************************************
+
+  *************************************
+  *                                   *
+  *        FlowSlotInfo               *
+  *                                   *
+  ************************************* <-+
+  *        FlowSlot                   *   |
+  *************************************   |
+  *        FlowSlot                   *   |
+  *************************************   +- num_slots
+  *        FlowSlot                   *   |
+  *************************************   |
+  *        FlowSlot                   *   |
+  ************************************* <-+
+
+  ********************************************** */
+
+  the_slot_len = sizeof(u_char)    /* flowSlot.slot_state */
+#ifdef RING_MAGIC
+    + sizeof(u_char)
+#endif
+    + sizeof(struct pcap_pkthdr)
+    + bucket_len      /* flowSlot.bucket */;
+
+  tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len;
+
+  /*
+    Calculate the value of the order parameter used later.
+    See http://www.linuxjournal.com/article.php?sid=1133
+  */
+  for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++)  ;
+
+  /*
+    We now try to allocate the memory as required. If we fail
+    we try to allocate a smaller amount of memory (hence a
+    smaller ring).
+  */
+  while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0)
+    if(pfr->order-- == 0)
+      break;
+
+  if(pfr->order == 0) {
+    printk("RING: ERROR not enough memory for ring\n");
+    return(-1);
+  } else {
+    printk("RING: successfully allocated %lu KB [tot_mem=%d][order=%ld]\n",
+          PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order);
+  }
+
+  tot_mem = PAGE_SIZE << pfr->order;
+  memset((char*)pfr->ring_memory, 0, tot_mem);
+
+  /* Now we need to reserve the pages */
+  page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
+  for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
+    SetPageReserved(page);
+
+  pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory;
+  pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo));
+
+  pfr->slots_info->version     = RING_FLOWSLOT_VERSION;
+  pfr->slots_info->slot_len    = the_slot_len;
+  pfr->slots_info->data_len    = bucket_len;
+  pfr->slots_info->tot_slots   = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len;
+  pfr->slots_info->tot_mem     = tot_mem;
+  pfr->slots_info->sample_rate = sample_rate;
+
+  printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n",
+        pfr->slots_info->tot_slots, pfr->slots_info->slot_len,
+        pfr->slots_info->tot_mem);
+
+#ifdef RING_MAGIC
+  {
+    int i;
+
+    for(i=0; i<pfr->slots_info->tot_slots; i++) {
+      unsigned long idx = i*pfr->slots_info->slot_len;
+      FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx];
+      slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0;
+    }
+  }
+#endif
+
+  pfr->insert_page_id = 1, pfr->insert_slot_id = 0;
+
+  /*
+    IMPORTANT
+    Leave this as the last statement: as soon as
+    ring_netdev != NULL the socket is ready to be used.
+  */
+  pfr->ring_netdev = dev;
+
+  return(0);
+}
+
+/* ************************************* */
+
+/* Bind to a device */
+static int ring_bind(struct socket *sock,
+                    struct sockaddr *sa, int addr_len)
+{
+  struct sock *sk=sock->sk;
+  struct net_device *dev = NULL;
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_bind() called\n");
+#endif
+
+  /*
+   *   Check legality
+   */
+  if (addr_len != sizeof(struct sockaddr))
+    return -EINVAL;
+  if (sa->sa_family != PF_RING)
+    return -EINVAL;
+
+  /* Safety check: add trailing zero if missing */
+  sa->sa_data[sizeof(sa->sa_data)-1] = '\0';
+
+#if defined(RING_DEBUG)
+  printk("RING: searching device %s\n", sa->sa_data);
+#endif
+
+  if((dev = __dev_get_by_name(sa->sa_data)) == NULL) {
+#if defined(RING_DEBUG)
+    printk("RING: search failed\n");
+#endif
+    return(-EINVAL);
+  } else
+    return(packet_ring_bind(sk, dev));
+}
+
+/* ************************************* */
+
+static int ring_mmap(struct file *file,
+                    struct socket *sock,
+                    struct vm_area_struct *vma)
+{
+  struct sock *sk = sock->sk;
+  struct ring_opt *pfr = ring_sk(sk);
+  unsigned long size, start;
+  u_int pagesToMap;
+  char *ptr;
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_mmap() called\n");
+#endif
+
+  if(pfr->ring_memory == 0) {
+#if defined(RING_DEBUG)
+    printk("RING: ring_mmap() failed: mapping area to an unbound socket\n");
+#endif
+    return -EINVAL;
+  }
+
+  size = (unsigned long)(vma->vm_end-vma->vm_start);
+
+  if(size % PAGE_SIZE) {
+#if defined(RING_DEBUG)
+    printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n");
+#endif
+    return(-EINVAL);
+  }
+
+  /* if userspace tries to mmap beyond end of our buffer, fail */
+  if(size > pfr->slots_info->tot_mem) {
+#if defined(RING_DEBUG)
+    printk("ring_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem);
+#endif
+    return(-EINVAL);
+  }
+
+  pagesToMap = size/PAGE_SIZE;
+
+#if defined(RING_DEBUG)
+  printk("RING: ring_mmap() called. %d pages to map\n", pagesToMap);
+#endif
+
+#if defined(RING_DEBUG)
+  printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n",
+        pfr->slots_info->slot_len, pfr->slots_info->tot_slots,
+        pfr->ring_netdev->name);
+#endif
+
+  /* we do not want to have this area swapped out, lock it */
+  vma->vm_flags |= VM_LOCKED;
+  start = vma->vm_start;
+
+  /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */
+  ptr = (char*)(start+PAGE_SIZE);
+
+  if(remap_page_range(
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+                     vma,
+#endif
+                     start,
+                     __pa(pfr->ring_memory),
+                     PAGE_SIZE*pagesToMap, vma->vm_page_prot)) {
+#if defined(RING_DEBUG)
+    printk("remap_page_range() failed\n");
+#endif
+    return(-EAGAIN);
+  }
+
+#if defined(RING_DEBUG)
+  printk("ring_mmap(pagesToMap=%d): success.\n", pagesToMap);
+#endif
+
+  return 0;
+}
+
+/* ************************************* */
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+static int ring_recvmsg(struct kiocb *iocb, struct socket *sock,
+                       struct msghdr *msg, size_t len, int flags)
+#else
+  static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len,
+                         int flags, struct scm_cookie *scm)
+#endif
+{
+  FlowSlot* slot;
+  struct ring_opt *pfr = ring_sk(sock->sk);
+  u_int32_t queued_pkts, num_loops = 0;
+
+#if defined(RING_DEBUG)
+  printk("ring_recvmsg called\n");
+#endif
+
+  slot = get_remove_slot(pfr);
+
+  while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) {
+    wait_event_interruptible(pfr->ring_slots_waitqueue, 1);
+
+#if defined(RING_DEBUG)
+    printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n",
+          slot->slot_state, queued_pkts, num_loops);
+#endif
+
+    if(queued_pkts > 0) {
+      if(num_loops++ > MAX_QUEUE_LOOPS)
+       break;
+    }
+  }
+
+#if defined(RING_DEBUG)
+  if(slot != NULL)
+    printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n",
+          queued_pkts, num_loops);
+#endif
+
+  return(queued_pkts);
+}
+
+/* ************************************* */
+
+unsigned int ring_poll(struct file * file,
+                      struct socket *sock, poll_table *wait)
+{
+  FlowSlot* slot;
+  struct ring_opt *pfr = ring_sk(sock->sk);
+
+#if defined(RING_DEBUG)
+  printk("poll called\n");
+#endif
+
+  slot = get_remove_slot(pfr);
+
+  if((slot != NULL) && (slot->slot_state == 0))
+    poll_wait(file, &pfr->ring_slots_waitqueue, wait);
+
+#if defined(RING_DEBUG)
+  printk("poll returning %d\n", slot->slot_state);
+#endif
+
+  if((slot != NULL) && (slot->slot_state == 1))
+    return(POLLIN | POLLRDNORM);
+  else
+    return(0);
+}
+
+/* ************************************* */
+
+int add_to_cluster_list(struct ring_cluster *el,
+                       struct sock *sock) {
+
+  if(el->num_cluster_elements == CLUSTER_LEN)
+    return(-1); /* Cluster full */
+
+  ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id;
+  el->sk[el->num_cluster_elements] = sock;
+  el->num_cluster_elements++;
+  return(0);
+}
+
+/* ************************************* */
+
+int remove_from_cluster_list(struct ring_cluster *el,
+                            struct sock *sock) {
+  int i, j;
+
+  for(i=0; i<CLUSTER_LEN; i++)
+    if(el->sk[i] == sock) {
+      el->num_cluster_elements--;
+
+      if(el->num_cluster_elements > 0) {
+       /* The cluster contains other elements */
+       for(j=i; j<CLUSTER_LEN-1; j++)
+         el->sk[j] = el->sk[j+1];
+
+       el->sk[CLUSTER_LEN-1] = NULL;
+      } else {
+       /* Empty cluster */
+       memset(el->sk, 0, sizeof(el->sk));
+      }
+
+      return(0);
+    }
+
+  return(-1); /* Not found */
+}
+
+/* ************************************* */
+
+static int remove_from_cluster(struct sock *sock,
+                              struct ring_opt *pfr)
+{
+  struct ring_cluster *el;
+
+#if defined(RING_DEBUG)
+  printk("--> remove_from_cluster(%d)\n", pfr->cluster_id);
+#endif
+
+  if(pfr->cluster_id == 0 /* 0 = No Cluster */)
+    return(0); /* Nothing to do */
+
+  el = ring_cluster_list;
+
+  while(el != NULL) {
+    if(el->cluster_id == pfr->cluster_id) {
+      return(remove_from_cluster_list(el, sock));
+    } else
+      el = el->next;
+  }
+
+  return(-EINVAL); /* Not found */
+}
+
+/* ************************************* */
+
+static int add_to_cluster(struct sock *sock,
+                         struct ring_opt *pfr,
+                         u_short cluster_id)
+{
+  struct ring_cluster *el;
+
+#if defined(RING_DEBUG)
+  printk("--> add_to_cluster(%d)\n", cluster_id);
+#endif
+
+  if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL);
+
+  if(pfr->cluster_id != 0)
+    remove_from_cluster(sock, pfr);
+
+  el = ring_cluster_list;
+
+  while(el != NULL) {
+    if(el->cluster_id == cluster_id) {
+      return(add_to_cluster_list(el, sock));
+    } else
+      el = el->next;
+  }
+
+  /* There's no existing cluster. We need to create one */
+  if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL)
+    return(-ENOMEM);
+
+  el->cluster_id = cluster_id;
+  el->num_cluster_elements = 1;
+  el->hashing_mode = cluster_per_flow; /* Default */
+  el->hashing_id   = 0;
+
+  memset(el->sk, 0, sizeof(el->sk));
+  el->sk[0] = sock;
+  el->next = ring_cluster_list;
+  ring_cluster_list = el;
+  pfr->cluster_id = cluster_id;
+
+  return(0); /* 0 = OK */
+}
+
+/* ************************************* */
+
+/* Code taken/inspired from core/sock.c */
+static int ring_setsockopt(struct socket *sock,
+                          int level, int optname,
+                          char *optval, int optlen)
+{
+  struct ring_opt *pfr = ring_sk(sock->sk);
+  int val, found, ret = 0;
+  u_int cluster_id;
+  char devName[8];
+
+  if((optlen<sizeof(int)) || (pfr == NULL))
+    return(-EINVAL);
+
+  if (get_user(val, (int *)optval))
+    return -EFAULT;
+
+  found = 1;
+
+  switch(optname)
+    {
+    case SO_ATTACH_FILTER:
+      ret = -EINVAL;
+      if (optlen == sizeof(struct sock_fprog)) {
+       unsigned int fsize;
+       struct sock_fprog fprog;
+       struct sk_filter *filter;
+
+       ret = -EFAULT;
+
+       /*
+         NOTE
+
+         Do not call copy_from_user within a held
+         spinlock (e.g. ring_mgmt_lock) as this caused
+         problems when certain debugging was enabled under
+         2.6.5 -- including hard lockups of the machine.
+       */
+       if(copy_from_user(&fprog, optval, sizeof(fprog)))
+         break;
+
+       fsize = sizeof(struct sock_filter) * fprog.len;
+       filter = kmalloc(fsize, GFP_KERNEL);
+
+       if(filter == NULL) {
+         ret = -ENOMEM;
+         break;
+       }
+
+       if(copy_from_user(filter->insns, fprog.filter, fsize))
+         break;
+
+       filter->len = fprog.len;
+
+       if(sk_chk_filter(filter->insns, filter->len) != 0) {
+         /* Bad filter specified */
+         kfree(filter);
+         pfr->bpfFilter = NULL;
+         break;
+       }
+
+       /* get the lock, set the filter, release the lock */
+       write_lock(&ring_mgmt_lock);
+       pfr->bpfFilter = filter;
+       write_unlock(&ring_mgmt_lock);
+      }
+      ret = 0;
+      break;
+
+    case SO_DETACH_FILTER:
+      write_lock(&ring_mgmt_lock);
+      found = 1;
+      if(pfr->bpfFilter != NULL) {
+       kfree(pfr->bpfFilter);
+       pfr->bpfFilter = NULL;
+       write_unlock(&ring_mgmt_lock);
+       break;
+      }
+      ret = -ENONET;
+      break;
+
+    case SO_ADD_TO_CLUSTER:
+      if (optlen!=sizeof(val))
+       return -EINVAL;
+
+      if (copy_from_user(&cluster_id, optval, sizeof(cluster_id)))
+       return -EFAULT;
+
+      write_lock(&ring_mgmt_lock);
+      ret = add_to_cluster(sock->sk, pfr, cluster_id);
+      write_unlock(&ring_mgmt_lock);
+      break;
+
+    case SO_REMOVE_FROM_CLUSTER:
+      write_lock(&ring_mgmt_lock);
+      ret = remove_from_cluster(sock->sk, pfr);
+      write_unlock(&ring_mgmt_lock);
+      break;
+
+    case SO_SET_REFLECTOR:
+      if(optlen >= (sizeof(devName)-1))
+       return -EINVAL;
+
+      if(optlen > 0) {
+       if(copy_from_user(devName, optval, optlen))
+         return -EFAULT;
+      }
+
+      devName[optlen] = '\0';
+
+#if defined(RING_DEBUG)
+      printk("+++ SO_SET_REFLECTOR(%s)\n", devName);
+#endif
+
+      write_lock(&ring_mgmt_lock);
+      pfr->reflector_dev = dev_get_by_name(devName);
+      write_unlock(&ring_mgmt_lock);
+
+#if defined(RING_DEBUG)
+      if(pfr->reflector_dev != NULL)
+       printk("SO_SET_REFLECTOR(%s): succeeded\n", devName);
+      else
+       printk("SO_SET_REFLECTOR(%s): device unknown\n", devName);
+#endif
+      break;
+
+    default:
+      found = 0;
+      break;
+    }
+
+  if(found)
+    return(ret);
+  else
+    return(sock_setsockopt(sock, level, optname, optval, optlen));
+}
+
+/* ************************************* */
+
+static int ring_ioctl(struct socket *sock,
+                     unsigned int cmd, unsigned long arg)
+{
+  switch(cmd)
+    {
+    case SIOCGIFFLAGS:
+    case SIOCSIFFLAGS:
+    case SIOCGIFCONF:
+    case SIOCGIFMETRIC:
+    case SIOCSIFMETRIC:
+    case SIOCGIFMEM:
+    case SIOCSIFMEM:
+    case SIOCGIFMTU:
+    case SIOCSIFMTU:
+    case SIOCSIFLINK:
+    case SIOCGIFHWADDR:
+    case SIOCSIFHWADDR:
+    case SIOCSIFMAP:
+    case SIOCGIFMAP:
+    case SIOCSIFSLAVE:
+    case SIOCGIFSLAVE:
+    case SIOCGIFINDEX:
+    case SIOCGIFNAME:
+    case SIOCGIFCOUNT:
+    case SIOCSIFHWBROADCAST:
+      return(dev_ioctl(cmd,(void *) arg));
+
+    default:
+      return -EOPNOTSUPP;
+    }
+
+  return 0;
+}
+
+/* ************************************* */
+
+static struct proto_ops ring_ops = {
+  .family      =       PF_RING,
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+  .owner       =       THIS_MODULE,
+#endif
+
+  /* Operations that make no sense on ring sockets. */
+  .connect     =       sock_no_connect,
+  .socketpair  =       sock_no_socketpair,
+  .accept      =       sock_no_accept,
+  .getname     =       sock_no_getname,
+  .listen      =       sock_no_listen,
+  .shutdown    =       sock_no_shutdown,
+  .sendpage    =       sock_no_sendpage,
+  .sendmsg     =       sock_no_sendmsg,
+  .getsockopt  =       sock_no_getsockopt,
+
+  /* Now the operations that really occur. */
+  .release     =       ring_release,
+  .bind                =       ring_bind,
+  .mmap                =       ring_mmap,
+  .poll                =       ring_poll,
+  .setsockopt  =       ring_setsockopt,
+  .ioctl       =       ring_ioctl,
+  .recvmsg     =       ring_recvmsg,
+};
+
+/* ************************************ */
+
+static struct net_proto_family ring_family_ops = {
+  .family      =       PF_RING,
+  .create      =       ring_create,
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+  .owner       =       THIS_MODULE,
+#endif
+};
+
+// BD: API changed in 2.6.12, ref:
+// http://svn.clkao.org/svnweb/linux/revision/?rev=28201
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
+static struct proto ring_proto = {
+  .name                =       "PF_RING",
+  .owner       =       THIS_MODULE,
+  .obj_size    =       sizeof(struct sock),
+};
+#endif
+
+/* ************************************ */
+
+static void __exit ring_exit(void)
+{
+  struct list_head *ptr;
+  struct ring_element *entry;
+
+  for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
+    entry = list_entry(ptr, struct ring_element, list);
+    kfree(entry);
+  }
+
+  while(ring_cluster_list != NULL) {
+    struct ring_cluster *next = ring_cluster_list->next;
+    kfree(ring_cluster_list);
+    ring_cluster_list = next;
+  }
+
+  set_skb_ring_handler(NULL);
+  set_buffer_ring_handler(NULL);
+  sock_unregister(PF_RING);
+
+  printk("PF_RING shut down.\n");
+}
+
+/* ************************************ */
+
+static int __init ring_init(void)
+{
+  printk("Welcome to PF_RING %s\n(C) 2004-05 L.Deri <deri@ntop.org>\n",
+        RING_VERSION);
+
+  INIT_LIST_HEAD(&ring_table);
+  ring_cluster_list = NULL;
+
+  sock_register(&ring_family_ops);
+
+  set_skb_ring_handler(skb_ring_handler);
+  set_buffer_ring_handler(buffer_ring_handler);
+
+  if(get_buffer_ring_handler() != buffer_ring_handler) {
+    printk("PF_RING: set_buffer_ring_handler FAILED\n");
+
+    set_skb_ring_handler(NULL);
+    set_buffer_ring_handler(NULL);
+    sock_unregister(PF_RING);
+    return -1;
+  } else {
+    printk("PF_RING: bucket length    %d bytes\n", bucket_len);
+    printk("PF_RING: ring slots       %d\n", num_slots);
+    printk("PF_RING: sample rate      %d [1=no sampling]\n", sample_rate);
+    printk("PF_RING: capture TX       %s\n",
+          enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
+    printk("PF_RING: transparent mode %s\n",
+          transparent_mode ? "Yes" : "No");
+
+    printk("PF_RING initialized correctly.\n");
+    return 0;
+  }
+}
+
+module_init(ring_init);
+module_exit(ring_exit);
+MODULE_LICENSE("GPL");
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+MODULE_ALIAS_NETPROTO(PF_RING);
+#endif
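
For completeness, a sketch of the consumer side implied by the code above (not part of the patch): packet_ring_bind() places the slot array at ring_memory + sizeof(FlowSlotInfo) (the "page 1" comment in ring_mmap() is a leftover from an earlier layout), and add_skb_to_ring() starts every full slot with a struct pcap_pkthdr followed by the captured bytes. The fd and the fully mapped info pointer are assumed to come from the socket()/bind()/mmap() sketch shown earlier:

#include <poll.h>
#include <sys/types.h>
#include <sys/time.h>
#include <linux/ring.h>

static void consume_ring(int fd, FlowSlotInfo *info) {
  char *slots = (char*)info + sizeof(FlowSlotInfo);

  for(;;) {
    FlowSlot *slot = (FlowSlot*)&slots[info->remove_idx * info->slot_len];

    if(slot->slot_state == 0) {
      /* Ring empty: ring_poll() parks us on ring_slots_waitqueue until
         add_skb_to_ring() fills a slot and wakes the queue */
      struct pollfd pfd = { .fd = fd, .events = POLLIN };
      poll(&pfd, 1, -1);
      continue;
    }

    /* Each full slot carries a pcap header followed by the packet bytes */
    struct pcap_pkthdr *hdr = (struct pcap_pkthdr*)&slot->bucket;
    u_char *pkt = (u_char*)&slot->bucket + sizeof(struct pcap_pkthdr);

    /* ... process hdr->caplen bytes at pkt ... */
    (void)pkt;

    slot->slot_state = 0;        /* hand the slot back to the kernel */
    info->tot_read++;
    info->remove_idx = (info->remove_idx + 1) % info->tot_slots;
  }
}

The cluster and reflector features (SO_ADD_TO_CLUSTER, SO_REMOVE_FROM_CLUSTER, SO_SET_REFLECTOR) are ordinary setsockopt() calls on the same descriptor and need no extra mapping.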