From 30bf34f05c0f66d9d9221f7ebfe9808024337dd4 Mon Sep 17 00:00:00 2001 From: nbd Date: Sat, 22 Oct 2005 22:03:56 +0000 Subject: [PATCH] add pf_ring patches for kernel and libpcap git-svn-id: svn://svn.openwrt.org/openwrt/trunk/openwrt@2266 3c298f89-4303-0410-b956-a3cf2f4a3e73 --- package/libpcap/Makefile | 2 +- package/libpcap/patches/110-pf_ring.patch | 613 ++ .../linux-2.4/patches/generic/223-pf_ring.patch | 6444 ++++++++++++++++++++ .../linux-2.6/patches/generic/104-pf_ring.patch | 5299 ++++++++++++++++ 4 files changed, 12357 insertions(+), 1 deletion(-) create mode 100644 package/libpcap/patches/110-pf_ring.patch create mode 100644 target/linux/linux-2.4/patches/generic/223-pf_ring.patch create mode 100644 target/linux/linux-2.6/patches/generic/104-pf_ring.patch diff --git a/package/libpcap/Makefile b/package/libpcap/Makefile index fa79e7461..c7dfd9991 100644 --- a/package/libpcap/Makefile +++ b/package/libpcap/Makefile @@ -57,7 +57,7 @@ $(PKG_BUILD_DIR)/.built: rm -rf $(PKG_INSTALL_DIR) mkdir -p $(PKG_INSTALL_DIR) $(MAKE) -C $(PKG_BUILD_DIR) \ - CCOPT="$(TARGET_CFLAGS)" \ + CCOPT="$(TARGET_CFLAGS) -I$(BUILD_DIR)/linux/include" \ DESTDIR="$(PKG_INSTALL_DIR)" \ all install touch $@ diff --git a/package/libpcap/patches/110-pf_ring.patch b/package/libpcap/patches/110-pf_ring.patch new file mode 100644 index 000000000..1d5124fac --- /dev/null +++ b/package/libpcap/patches/110-pf_ring.patch @@ -0,0 +1,613 @@ +diff -urN libpcap.old/pcap-int.h libpcap.dev/pcap-int.h +--- libpcap.old/pcap-int.h 2003-12-15 02:42:24.000000000 +0100 ++++ libpcap.dev/pcap-int.h 2005-10-22 23:20:12.220060500 +0200 +@@ -30,7 +30,7 @@ + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * +- * @(#) $Header: /tcpdump/master/libpcap/pcap-int.h,v 1.55.2.4 2003/12/15 01:42:24 guy Exp $ (LBL) ++ * @(#) $Header: /export/home/ntop/PF_RING/userland/libpcap-0.8.1-ring/pcap-int.h,v 1.2 2004/11/25 09:58:00 deri Exp $ (LBL) + */ + + #ifndef pcap_int_h +@@ -46,6 +46,8 @@ + #include + #endif /* WIN32 */ + ++#define RING /* L.Deri */ ++ + /* + * Savefile + */ +@@ -93,6 +95,57 @@ + #endif + }; + ++/* **************************** */ ++ ++#ifdef RING ++ ++#include ++#include ++#include ++#include ++ ++#define PAGE_SIZE 4096 ++ ++#define HAVE_PCAP ++#include ++#endif ++ ++#ifdef RING ++ ++#define E1000_RXD_STAT_DD 0x01 /* Descriptor Done */ ++ ++struct e1000_rx_desc { ++ u_int64_t buffer_addr; /* Address of the descriptor's data buffer */ ++ u_int16_t length; /* Length of data DMAed into data buffer */ ++ u_int16_t csum; /* Packet checksum */ ++ u_int8_t status; /* Descriptor status */ ++ u_int8_t errors; /* Descriptor Errors */ ++ u_int16_t special; ++}; ++ ++/* Transmit Descriptor */ ++struct e1000_tx_desc { ++ u_int64_t buffer_addr; /* Address of the descriptor's data buffer */ ++ union { ++ u_int32_t data; ++ struct { ++ u_int16_t length; /* Data buffer length */ ++ u_int8_t cso; /* Checksum offset */ ++ u_int8_t cmd; /* Descriptor control */ ++ } flags; ++ } lower; ++ union { ++ u_int32_t data; ++ struct { ++ u_int8_t status; /* Descriptor status */ ++ u_int8_t css; /* Checksum start */ ++ u_int16_t special; ++ } fields; ++ } upper; ++}; ++ ++#endif ++ + struct pcap { + #ifdef WIN32 + ADAPTER *adapter; +@@ -121,6 +174,14 @@ + u_char *bp; + int cc; + ++#ifdef RING ++ /* PF_RING */ ++ char *ring_buffer, *ring_slots; ++ int ring_fd; ++ FlowSlotInfo *slots_info; ++ u_int page_id, slot_id, pkts_per_page; ++ u_int poll_sleep; ++#endif + /* + * Place holder for pcap_next(). 
+ */ +diff -urN libpcap.old/pcap-linux.c libpcap.dev/pcap-linux.c +--- libpcap.old/pcap-linux.c 2003-11-21 11:20:46.000000000 +0100 ++++ libpcap.dev/pcap-linux.c 2005-10-22 23:43:59.726120250 +0200 +@@ -27,7 +27,7 @@ + + #ifndef lint + static const char rcsid[] _U_ = +- "@(#) $Header: /tcpdump/master/libpcap/pcap-linux.c,v 1.98.2.4 2003/11/21 10:20:46 guy Exp $ (LBL)"; ++ "@(#) $Header: /export/home/ntop/PF_RING/userland/libpcap-0.8.1-ring/pcap-linux.c,v 1.2 2004/11/25 09:58:00 deri Exp $ (LBL)"; + #endif + + /* +@@ -83,7 +83,7 @@ + #ifdef HAVE_DAG_API + #include "pcap-dag.h" + #endif /* HAVE_DAG_API */ +- ++ + #include + #include + #include +@@ -217,6 +217,83 @@ + = { 1, &total_insn }; + #endif + ++#define RING /* L.Deri */ ++#define SAFE_RING_MODE /* ++ Copy the bucket in order to avoid kernel ++ crash if the application faults ++ */ ++ ++#ifdef RING ++unsigned char *write_register; ++static struct pcap_stat ringStats; ++u_long numPollCalls = 0, numReadCalls = 0; ++ ++#define POLL_SLEEP_STEP 10 /* ns = 0.1 ms */ ++#define POLL_SLEEP_MIN POLL_SLEEP_STEP ++#define POLL_SLEEP_MAX 1000 /* ns */ ++#define POLL_QUEUE_MIN_LEN 500 /* # packets */ ++ ++#ifdef SAFE_RING_MODE ++static char staticBucket[2048]; ++#endif ++ ++ ++/* ******************************* */ ++ ++int pcap_set_cluster(pcap_t *handle, u_int clusterId) { ++ return(handle->ring_fd ? setsockopt(handle->ring_fd, 0, SO_ADD_TO_CLUSTER, ++ &clusterId, sizeof(clusterId)): -1); ++} ++ ++/* ******************************* */ ++ ++int pcap_remove_from_cluster(pcap_t *handle) { ++ return(handle->ring_fd ? ++ setsockopt(handle->ring_fd, 0, SO_REMOVE_FROM_CLUSTER, NULL, 0) : -1); ++} ++ ++/* ******************************* */ ++ ++int pcap_set_reflector(pcap_t *handle, char *reflectorDevice) { ++ return(handle->ring_fd ? 
++ setsockopt(handle->ring_fd, 0, SO_SET_REFLECTOR, ++ &reflectorDevice, strlen(reflectorDevice)) : -1); ++} ++ ++/* ******************************* */ ++ ++static int set_if_promisc(const char *device, int set_promisc) { ++ int sock_fd; ++ struct ifreq ifr; ++ ++ if(device == NULL) return(-3); ++ ++ sock_fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); ++ if(sock_fd <= 0) return(-1); ++ ++ memset(&ifr, 0, sizeof(ifr)); ++ strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name)); ++ if(ioctl(sock_fd, SIOCGIFFLAGS, &ifr) == -1) { ++ close(sock_fd); ++ return(-2); ++ } ++ ++ if(set_promisc) { ++ if((ifr.ifr_flags & IFF_PROMISC) == 0) ifr.ifr_flags |= IFF_PROMISC; ++ } else { ++ /* Remove promisc */ ++ if((ifr.ifr_flags & IFF_PROMISC) != 0) ifr.ifr_flags &= ~IFF_PROMISC; ++ } ++ ++ if(ioctl(sock_fd, SIOCSIFFLAGS, &ifr) == -1) ++ return(-1); ++ ++ close(sock_fd); ++ return(0); ++} ++ ++#endif ++ + /* + * Get a handle for a live capture from the given device. You can + * pass NULL as device to get all packages (without link level +@@ -258,6 +335,138 @@ + handle->snapshot = snaplen; + handle->md.timeout = to_ms; + ++#ifdef RING ++ handle->ring_fd = handle->fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL)); ++ ++ printf("Open RING [fd=%d]\n", handle->ring_fd); ++ ++ if(handle->ring_fd > 0) { ++ struct sockaddr sa; ++ int rc; ++ u_int memSlotsLen; ++ ++ err = 0; ++ sa.sa_family = PF_RING; ++ snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", device); ++ rc = bind(handle->ring_fd, (struct sockaddr *)&sa, sizeof(sa)); ++ ++ if(rc == 0) { ++ ++ ++ handle->md.device = strdup(device); ++ handle->ring_buffer = (char *)mmap(NULL, PAGE_SIZE, ++ PROT_READ|PROT_WRITE, ++ MAP_SHARED, ++ handle->ring_fd, 0); ++ ++ if(handle->ring_buffer == MAP_FAILED) { ++ sprintf(ebuf, "mmap() failed"); ++ return (NULL); ++ } ++ ++ handle->slots_info = (FlowSlotInfo *)handle->ring_buffer; ++ if(handle->slots_info->version != RING_FLOWSLOT_VERSION) { ++ snprintf(ebuf, PCAP_ERRBUF_SIZE, "Wrong RING version: " 
++ "kernel is %i, libpcap was compiled with %i\n", ++ handle->slots_info->version, RING_FLOWSLOT_VERSION); ++ return (NULL); ++ } ++ memSlotsLen = handle->slots_info->tot_mem; ++ munmap(handle->ring_buffer, PAGE_SIZE); ++ ++ handle->ring_buffer = (char *)mmap(NULL, memSlotsLen, ++ PROT_READ|PROT_WRITE, ++ MAP_SHARED, handle->ring_fd, 0); ++ ++ if(handle->ring_buffer == MAP_FAILED) { ++ sprintf(ebuf, "mmap() failed"); ++ return (NULL); ++ } ++ ++ handle->slots_info = (FlowSlotInfo *)handle->ring_buffer; ++ handle->ring_slots = (char *)(handle->ring_buffer+sizeof(FlowSlotInfo)); ++ ++ /* Safety check */ ++ if(handle->slots_info->remove_idx >= handle->slots_info->tot_slots) ++ handle->slots_info->remove_idx = 0; ++ ++ handle->page_id = PAGE_SIZE, handle->slot_id = 0, ++ handle->pkts_per_page = 0; ++ ++ if(0) { ++ int i; ++ ++ for(i=0; i<handle->slots_info->tot_slots; i++) { ++ unsigned long idx = i*handle->slots_info->slot_len; ++ FlowSlot *slot = (FlowSlot*)&handle->ring_slots[idx]; ++ ++ printf("RING: Setting RING_MAGIC_VALUE into slot %d [displacement=%lu]\n", i, idx); ++ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; ++ printf("RING: slot[%d]: magic=%d, slot_state=%d\n", ++ slot->magic, slot->slot_state); ++ } ++ } ++ ++ ++ /* Set defaults */ ++ handle->linktype = DLT_EN10MB; ++ handle->offset = 2; ++ ++ printf("RING (%s): tot_slots=%d/slot_len=%d/" ++ "insertIdx=%d/remove_idx=%d/dropped=%d\n", ++ device, ++ handle->slots_info->tot_slots, ++ handle->slots_info->slot_len, ++ handle->slots_info->insert_idx, ++ handle->slots_info->remove_idx, ++ handle->slots_info->tot_lost); ++ ++ ringStats.ps_recv = handle->slots_info->tot_read; ++ ringStats.ps_drop = handle->slots_info->tot_lost; ++ ++ if(promisc) { ++ struct ifreq ifr; ++ ++ err = 0; ++ memset(&ifr, 0, sizeof(ifr)); ++ strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name)); ++ if (ioctl(handle->fd, SIOCGIFFLAGS, &ifr) == -1) { ++ snprintf(ebuf, PCAP_ERRBUF_SIZE, ++ "ioctl: %s", pcap_strerror(errno)); ++ err = 1; ++ }
++ ++ if(err == 0) { ++ if ((ifr.ifr_flags & IFF_PROMISC) == 0) { ++ /* ++ * Promiscuous mode isn't currently on, ++ * so turn it on, and remember that ++ * we should turn it off when the ++ * pcap_t is closed. ++ */ ++ ++ ifr.ifr_flags |= IFF_PROMISC; ++ if (ioctl(handle->fd, SIOCSIFFLAGS, &ifr) == -1) { ++ snprintf(ebuf, PCAP_ERRBUF_SIZE, ++ "ioctl: %s", pcap_strerror(errno)); ++ err = 1; ++ } ++ } ++ ++ if(err == 0) ++ handle->md.clear_promisc = 1; ++ } ++ } ++ ++ if(err == 0) ++ goto open_open_live_final; ++ } ++ ++ /* Don't put 'else' above... */ ++ close(handle->ring_fd); ++ /* Continue without ring support */ ++ } ++#endif + /* + * NULL and "any" are special devices which give us the hint to + * monitor all devices. +@@ -397,6 +606,9 @@ + return NULL; + } + ++#ifdef RING ++ open_open_live_final: ++#endif + /* + * "handle->fd" is a socket, so "select()" and "poll()" + * should work on it. +@@ -449,6 +661,120 @@ + int packet_len, caplen; + struct pcap_pkthdr pcap_header; + ++#ifdef RING ++ if(handle->ring_buffer != NULL) { ++ u_int idx, numRuns = 0, ptrAddr; ++ FlowSlot *slot; ++ ++ slot = (FlowSlot*)&handle->ring_slots[handle->slots_info->remove_idx*handle->slots_info->slot_len]; ++ ++ while(1) { ++ u_int32_t queuedPkts; ++ ++ if(handle->slots_info->tot_insert >= handle->slots_info->tot_read) ++ queuedPkts = handle->slots_info->tot_insert - handle->slots_info->tot_read; ++ else ++ queuedPkts = handle->slots_info->tot_slots + handle->slots_info->tot_insert - handle->slots_info->tot_read; ++ ++ if(queuedPkts && (slot->slot_state == 1)) { ++ char *bucket = &slot->bucket; ++ ++#ifdef RING_MAGIC ++ if(slot->magic != RING_MAGIC_VALUE) { ++ printf("==>> Bad Magic [remove_idx=%u][insert_idx=%u][ptrAddr=%u]\n", ++ handle->slots_info->remove_idx, ++ handle->slots_info->insert_idx, ++ ptrAddr); ++ slot->magic = RING_MAGIC_VALUE; ++ } ++#endif ++ ++ ++ handle->md.stat.ps_recv++; ++ ++#ifdef SAFE_RING_MODE ++ { ++ struct pcap_pkthdr *hdr = (struct pcap_pkthdr*)bucket; ++ 
int bktLen = hdr->caplen; ++ ++ if(bktLen > sizeof(staticBucket)) ++ bktLen = sizeof(staticBucket); ++ ++ memcpy(staticBucket, &bucket[sizeof(struct pcap_pkthdr)], bktLen); ++ ++#ifdef RING_DEBUG ++ printf("==>> [remove_idx=%u][insert_idx=%u][ptrAddr=%u]\n", ++ handle->slots_info->remove_idx, ++ handle->slots_info->insert_idx, ++ ptrAddr); ++#endif ++ ++ callback(userdata, hdr, staticBucket); ++ } ++#else ++ callback(userdata, ++ (const struct pcap_pkthdr*)bucket, ++ (const u_char*)&bucket[sizeof(struct pcap_pkthdr)]); ++#endif ++ ++ if(handle->slots_info->remove_idx >= (handle->slots_info->tot_slots-1)) { ++ handle->slots_info->remove_idx = 0; ++ handle->page_id = PAGE_SIZE, handle->slot_id = 0, handle->pkts_per_page = 0; ++ } else { ++ handle->slots_info->remove_idx++; ++ handle->pkts_per_page++, handle->slot_id += handle->slots_info->slot_len; ++ } ++ ++ handle->slots_info->tot_read++; ++ slot->slot_state = 0; ++ ++ return(1); ++ } else { ++ struct pollfd pfd; ++ int rc; ++ ++ /* Sleep when nothing is happening */ ++ pfd.fd = handle->ring_fd; ++ pfd.events = POLLIN|POLLERR; ++ pfd.revents = 0; ++ ++#ifdef RING_DEBUG ++ printf("==>> poll [remove_idx=%u][insert_idx=%u][loss=%d][queuedPkts=%u]" ++ "[slot_state=%d][tot_insert=%u][tot_read=%u]\n", ++ handle->slots_info->remove_idx, ++ handle->slots_info->insert_idx, ++ handle->slots_info->tot_lost, ++ queuedPkts, slot->slot_state, ++ handle->slots_info->tot_insert, ++ handle->slots_info->tot_read); ++ #endif ++ ++#ifdef RING_DEBUG ++ printf("==>> poll @ [remove_idx=%u][slot_id=%u]\n", handle->slots_info->remove_idx, handle->slot_id); ++#endif ++ errno = 0; ++ rc = poll(&pfd, 1, -1); ++#ifdef RING_DEBUG ++ printf("==>> poll returned %d [%s][errno=%d][break_loop=%d]\n", ++ rc, strerror(errno), errno, handle->break_loop); ++#endif ++ numPollCalls++; ++ ++ if(rc == -1) { ++ if(errno == EINTR) { ++ if(handle->break_loop) { ++ handle->break_loop = 0; ++ return(-2); ++ } else ++ return(0); ++ } else ++ return(-1); ++ } ++ } 
++ } /* while() */ ++ } ++#endif ++ + #ifdef HAVE_PF_PACKET_SOCKETS + /* + * If this is a cooked device, leave extra room for a +@@ -688,6 +1014,22 @@ + socklen_t len = sizeof (struct tpacket_stats); + #endif + ++#ifdef RING ++ if(handle->ring_fd > 0) { ++ stats->ps_recv = handle->slots_info->tot_read-ringStats.ps_recv; ++ stats->ps_drop = handle->slots_info->tot_lost-ringStats.ps_drop; ++ ++ printf("RING: numPollCalls=%d [%.1f packets/call]\n", ++ numPollCalls, (float)stats->ps_recv/(float)numPollCalls); ++ printf("RING: [tot_pkts=%u][tot_read=%u][tot_lost=%u]\n", ++ handle->slots_info->tot_pkts, ++ handle->slots_info->tot_read, ++ handle->slots_info->tot_lost); ++ ++ return(0); ++ } ++#endif ++ + #ifdef HAVE_TPACKET_STATS + /* + * Try to get the packet counts from the kernel. +@@ -879,6 +1221,11 @@ + } + } + ++ ++#ifdef RING ++ if(handle->ring_fd <= 0) can_filter_in_kernel = 0; ++#endif ++ + if (can_filter_in_kernel) { + if ((err = set_kernel_filter(handle, &fcode)) == 0) + { +@@ -1348,7 +1695,7 @@ + memset(&mr, 0, sizeof(mr)); + mr.mr_ifindex = device_id; + mr.mr_type = PACKET_MR_PROMISC; +- if (setsockopt(sock_fd, SOL_PACKET, ++ if (setsockopt(sock_fd, 0 /* SOL_PACKET */, + PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr)) == -1) + { + snprintf(ebuf, PCAP_ERRBUF_SIZE, +@@ -1425,10 +1772,11 @@ + + /* Any pending errors, e.g., network is down? 
*/ + +- if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1) { +- snprintf(ebuf, PCAP_ERRBUF_SIZE, +- "getsockopt: %s", pcap_strerror(errno)); +- return -2; ++ if ((getsockopt(fd, PF_RING, SO_ERROR, &err, &errlen) == -1) ++ && (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1)) { ++ snprintf(ebuf, PCAP_ERRBUF_SIZE, ++ "getsockopt: %s", pcap_strerror(errno)); ++ return -2; + } + + if (err > 0) { +@@ -1482,6 +1830,13 @@ + struct pcap *p, *prevp; + struct ifreq ifr; + ++#ifdef RING ++ if(handle->ring_buffer != NULL) { ++ munmap(handle->ring_buffer, handle->slots_info->tot_mem); ++ handle->ring_buffer = NULL; ++ } ++#endif ++ + if (handle->md.clear_promisc) { + /* + * We put the interface into promiscuous mode; take +@@ -1698,11 +2053,11 @@ + } + + /* Any pending errors, e.g., network is down? */ +- +- if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1) { +- snprintf(ebuf, PCAP_ERRBUF_SIZE, +- "getsockopt: %s", pcap_strerror(errno)); +- return -1; ++ if((getsockopt(fd, PF_RING, SO_ERROR, &err, &errlen) == -1) ++ && (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1)) { ++ snprintf(ebuf, PCAP_ERRBUF_SIZE, ++ "getsockopt: %s", pcap_strerror(errno)); ++ return -1; + } + + if (err > 0) { +@@ -1924,8 +2279,11 @@ + * the filtering done in userland even if it could have been + * done in the kernel. + */ +- if (setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER, +- &total_fcode, sizeof(total_fcode)) == 0) { ++ printf("pcap[setsockopt(%d)]\n", 0); ++ if (setsockopt(handle->fd, 0 /* SOL_SOCKET */, ++ SO_ATTACH_FILTER, ++ &total_fcode, ++ sizeof(total_fcode)) == 0) { + char drain[1]; + + /* +@@ -1933,6 +2291,9 @@ + */ + total_filter_on = 1; + ++#ifdef RING ++ if(!handle->ring_fd) { ++#endif + /* + * Save the socket's current mode, and put it in + * non-blocking mode; we drain it by reading packets +@@ -1955,12 +2316,15 @@ + return -2; + } + } +- } ++#ifdef RING ++ } ++#endif ++} + + /* + * Now attach the new filter. 
+ */ +- ret = setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER, ++ ret = setsockopt(handle->fd, 0 /* SOL_SOCKET */, SO_ATTACH_FILTER, + fcode, sizeof(*fcode)); + if (ret == -1 && total_filter_on) { + /* +@@ -1993,7 +2357,8 @@ + /* setsockopt() barfs unless it get a dummy parameter */ + int dummy; + +- return setsockopt(handle->fd, SOL_SOCKET, SO_DETACH_FILTER, +- &dummy, sizeof(dummy)); ++ return setsockopt(handle->fd, handle->ring_fd > 0 ? PF_RING : SOL_SOCKET, ++ SO_DETACH_FILTER, ++ &dummy, sizeof(dummy)); + } + #endif diff --git a/target/linux/linux-2.4/patches/generic/223-pf_ring.patch b/target/linux/linux-2.4/patches/generic/223-pf_ring.patch new file mode 100644 index 000000000..1235e1044 --- /dev/null +++ b/target/linux/linux-2.4/patches/generic/223-pf_ring.patch @@ -0,0 +1,6444 @@ +diff --unified --recursive --new-file linux-2.4.30/include/linux/ring.h linux-2.4.30-1-686-smp-ring3/include/linux/ring.h +--- linux-2.4.30/include/linux/ring.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/linux/ring.h 2005-10-22 23:08:27.388011250 +0200 +@@ -0,0 +1,108 @@ ++/* ++ * Definitions for packet ring ++ * ++ * 2004 - Luca Deri ++ */ ++#ifndef __RING_H ++#define __RING_H ++ ++ ++#define INCLUDE_MAC_INFO ++ ++#ifdef INCLUDE_MAC_INFO ++#define SKB_DISPLACEMENT 14 /* Include MAC address information */ ++#else ++#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */ ++#endif ++ ++#define RING_MAGIC ++#define RING_MAGIC_VALUE 0x88 ++#define RING_FLOWSLOT_VERSION 5 ++#define RING_VERSION "3.0" ++ ++#define SO_ADD_TO_CLUSTER 99 ++#define SO_REMOVE_FROM_CLUSTER 100 ++#define SO_SET_REFLECTOR 101 ++ ++/* *********************************** */ ++ ++#ifndef HAVE_PCAP ++struct pcap_pkthdr { ++ struct timeval ts; /* time stamp */ ++ u_int32_t caplen; /* length of portion present */ ++ u_int32_t len; /* length this packet (off wire) */ ++}; ++#endif ++ ++/* *********************************** */ ++ ++enum cluster_type { ++ 
++ cluster_per_flow = 0, ++ cluster_round_robin ++}; ++ ++/* *********************************** */ ++ ++#define RING_MIN_SLOT_SIZE (60+sizeof(struct pcap_pkthdr)) ++#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pcap_pkthdr)) ++ ++/* *********************************** */ ++ ++typedef struct flowSlotInfo { ++ u_int16_t version, sample_rate; ++ u_int32_t tot_slots, slot_len, tot_mem; ++ ++ u_int64_t tot_pkts, tot_lost; ++ u_int64_t tot_insert, tot_read; ++ u_int16_t insert_idx; ++ u_int16_t remove_idx; ++} FlowSlotInfo; ++ ++/* *********************************** */ ++ ++typedef struct flowSlot { ++#ifdef RING_MAGIC ++ u_char magic; /* It must always be RING_MAGIC_VALUE */ ++#endif ++ u_char slot_state; /* 0=empty, 1=full */ ++ u_char bucket; /* bucket[bucketLen] */ ++} FlowSlot; ++ ++/* *********************************** */ ++ ++#ifdef __KERNEL__ ++ ++FlowSlotInfo* getRingPtr(void); ++int allocateRing(char *deviceName, u_int numSlots, ++ u_int bucketLen, u_int sampleRate); ++unsigned int pollRing(struct file *fp, struct poll_table_struct * wait); ++void deallocateRing(void); ++ ++/* ************************* */ ++ ++typedef int (*handle_ring_skb)(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++extern handle_ring_skb get_skb_ring_handler(void); ++extern void set_skb_ring_handler(handle_ring_skb the_handler); ++extern void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++ ++typedef int (*handle_ring_buffer)(struct net_device *dev, ++ char *data, int len); ++extern handle_ring_buffer get_buffer_ring_handler(void); ++extern void set_buffer_ring_handler(handle_ring_buffer the_handler); ++extern int do_buffer_ring_handler(struct net_device *dev, ++ char *data, int len); ++#endif /* __KERNEL__ */ ++ ++/* *********************************** */ ++ ++#define PF_RING 27 /* Packet Ring */ ++#define SOCK_RING PF_RING ++ ++/* ioctl() */ ++#define SIORINGPOLL 0x8888 ++ ++/* *********************************** */ ++ ++#endif /* __RING_H */
+diff --unified --recursive --new-file linux-2.4.30/include/net/sock.h linux-2.4.30-1-686-smp-ring3/include/net/sock.h +--- linux-2.4.30/include/net/sock.h 2004-11-17 12:54:22.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/net/sock.h 2005-10-22 23:08:27.976048000 +0200 +@@ -699,6 +699,9 @@ + #if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) + struct packet_opt *af_packet; + #endif ++#if defined(CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ struct ring_opt *pf_ring; ++#endif + #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) + x25_cb *x25; + #endif +diff --unified --recursive --new-file linux-2.4.30/include/net/sock.h.ORG linux-2.4.30-1-686-smp-ring3/include/net/sock.h.ORG +--- linux-2.4.30/include/net/sock.h.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/net/sock.h.ORG 2005-10-22 23:08:27.940045750 +0200 +@@ -0,0 +1,1400 @@ ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * Definitions for the AF_INET socket handler. ++ * ++ * Version: @(#)sock.h 1.0.4 05/13/93 ++ * ++ * Authors: Ross Biro, ++ * Fred N. van Kempen, ++ * Corey Minyard ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Volatiles in skbuff pointers. See ++ * skbuff comments. May be overdone, ++ * better to prove they can be removed ++ * than the reverse. ++ * Alan Cox : Added a zapped field for tcp to note ++ * a socket is reset and must stay shut up ++ * Alan Cox : New fields for options ++ * Pauline Middelink : identd support ++ * Alan Cox : Eliminate low level recv/recvfrom ++ * David S. Miller : New socket lookup architecture. 
++ * Steve Whitehouse: Default routines for sock_ops ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#ifndef _SOCK_H ++#define _SOCK_H ++ ++#include ++#include ++#include ++#include /* struct sockaddr_in */ ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++#include /* struct sockaddr_in6 */ ++#include /* dest_cache, inet6_options */ ++#include ++#include /* struct ipv6_mc_socklist */ ++#endif ++ ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++#include ++#endif ++#include /* struct tcphdr */ ++#if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE) ++#include /* struct sctp_opt */ ++#endif ++ ++#include ++#include /* struct sk_buff */ ++#include /* struct inet_protocol */ ++#if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) ++#include ++#endif ++#if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) ++#include ++#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) ++#include ++#endif ++#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) ++#include ++#endif ++#endif ++ ++#if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE) ++#include ++#include /* struct ppp_channel */ ++#endif ++ ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++#if defined(CONFIG_SPX) || defined(CONFIG_SPX_MODULE) ++#include ++#else ++#include ++#endif /* CONFIG_SPX */ ++#endif /* CONFIG_IPX */ ++ ++#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) ++struct atm_vcc; ++#endif ++ 
++#ifdef CONFIG_FILTER ++#include ++#endif ++ ++#include ++#include ++ ++ ++/* The AF_UNIX specific socket options */ ++struct unix_opt { ++ struct unix_address *addr; ++ struct dentry * dentry; ++ struct vfsmount * mnt; ++ struct semaphore readsem; ++ struct sock * other; ++ struct sock ** list; ++ struct sock * gc_tree; ++ atomic_t inflight; ++ rwlock_t lock; ++ wait_queue_head_t peer_wait; ++}; ++ ++ ++/* Once the IPX ncpd patches are in these are going into protinfo. */ ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++struct ipx_opt { ++ ipx_address dest_addr; ++ ipx_interface *intrfc; ++ unsigned short port; ++#ifdef CONFIG_IPX_INTERN ++ unsigned char node[IPX_NODE_LEN]; ++#endif ++ unsigned short type; ++/* ++ * To handle special ncp connection-handling sockets for mars_nwe, ++ * the connection number must be stored in the socket. ++ */ ++ unsigned short ipx_ncp_conn; ++}; ++#endif ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++struct ipv6_pinfo { ++ struct in6_addr saddr; ++ struct in6_addr rcv_saddr; ++ struct in6_addr daddr; ++ struct in6_addr *daddr_cache; ++ ++ __u32 flow_label; ++ __u32 frag_size; ++ int hop_limit; ++ int mcast_hops; ++ int mcast_oif; ++ ++ /* pktoption flags */ ++ union { ++ struct { ++ __u8 srcrt:2, ++ rxinfo:1, ++ rxhlim:1, ++ hopopts:1, ++ dstopts:1, ++ authhdr:1, ++ rxflow:1; ++ } bits; ++ __u8 all; ++ } rxopt; ++ ++ /* sockopt flags */ ++ __u8 mc_loop:1, ++ recverr:1, ++ sndflow:1, ++ pmtudisc:2, ++ ipv6only:1; ++ ++ struct ipv6_mc_socklist *ipv6_mc_list; ++ struct ipv6_ac_socklist *ipv6_ac_list; ++ struct ipv6_fl_socklist *ipv6_fl_list; ++ __u32 dst_cookie; ++ ++ struct ipv6_txoptions *opt; ++ struct sk_buff *pktoptions; ++}; ++ ++struct raw6_opt { ++ __u32 checksum; /* perform checksum */ ++ __u32 offset; /* checksum offset */ ++ ++ struct icmp6_filter filter; ++}; ++ ++#define __ipv6_only_sock(sk) ((sk)->net_pinfo.af_inet6.ipv6only) ++#define ipv6_only_sock(sk) ((sk)->family == PF_INET6 && \ ++ 
(sk)->net_pinfo.af_inet6.ipv6only) ++#else ++#define __ipv6_only_sock(sk) 0 ++#define ipv6_only_sock(sk) 0 ++#endif /* IPV6 */ ++ ++#if defined(CONFIG_INET) || defined(CONFIG_INET_MODULE) ++struct raw_opt { ++ struct icmp_filter filter; ++}; ++#endif ++ ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++struct inet_opt ++{ ++ int ttl; /* TTL setting */ ++ int tos; /* TOS */ ++ unsigned cmsg_flags; ++ struct ip_options *opt; ++ unsigned char hdrincl; /* Include headers ? */ ++ __u8 mc_ttl; /* Multicasting TTL */ ++ __u8 mc_loop; /* Loopback */ ++ unsigned recverr : 1, ++ freebind : 1; ++ __u16 id; /* ID counter for DF pkts */ ++ __u8 pmtudisc; ++ int mc_index; /* Multicast device index */ ++ __u32 mc_addr; ++ struct ip_mc_socklist *mc_list; /* Group array */ ++}; ++#endif ++ ++#if defined(CONFIG_PPPOE) || defined (CONFIG_PPPOE_MODULE) ++struct pppoe_opt ++{ ++ struct net_device *dev; /* device associated with socket*/ ++ struct pppoe_addr pa; /* what this socket is bound to*/ ++ struct sockaddr_pppox relay; /* what socket data will be ++ relayed to (PPPoE relaying) */ ++}; ++ ++struct pppox_opt ++{ ++ struct ppp_channel chan; ++ struct sock *sk; ++ struct pppox_opt *next; /* for hash table */ ++ union { ++ struct pppoe_opt pppoe; ++ } proto; ++}; ++#define pppoe_dev proto.pppoe.dev ++#define pppoe_pa proto.pppoe.pa ++#define pppoe_relay proto.pppoe.relay ++#endif ++ ++/* This defines a selective acknowledgement block. */ ++struct tcp_sack_block { ++ __u32 start_seq; ++ __u32 end_seq; ++}; ++ ++enum tcp_congestion_algo { ++ TCP_RENO=0, ++ TCP_VEGAS, ++ TCP_WESTWOOD, ++ TCP_BIC, ++}; ++ ++struct tcp_opt { ++ int tcp_header_len; /* Bytes of tcp header to send */ ++ ++/* ++ * Header prediction flags ++ * 0x5?10 << 16 + snd_wnd in net byte order ++ */ ++ __u32 pred_flags; ++ ++/* ++ * RFC793 variables by their proper names. This means you can ++ * read the code and the spec side by side (and laugh ...) ++ * See RFC793 and RFC1122. 
The RFC writes these in capitals. ++ */ ++ __u32 rcv_nxt; /* What we want to receive next */ ++ __u32 snd_nxt; /* Next sequence we send */ ++ ++ __u32 snd_una; /* First byte we want an ack for */ ++ __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ ++ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ ++ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ ++ ++ /* Delayed ACK control data */ ++ struct { ++ __u8 pending; /* ACK is pending */ ++ __u8 quick; /* Scheduled number of quick acks */ ++ __u8 pingpong; /* The session is interactive */ ++ __u8 blocked; /* Delayed ACK was blocked by socket lock*/ ++ __u32 ato; /* Predicted tick of soft clock */ ++ unsigned long timeout; /* Currently scheduled timeout */ ++ __u32 lrcvtime; /* timestamp of last received data packet*/ ++ __u16 last_seg_size; /* Size of last incoming segment */ ++ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ ++ } ack; ++ ++ /* Data for direct copy to user */ ++ struct { ++ struct sk_buff_head prequeue; ++ struct task_struct *task; ++ struct iovec *iov; ++ int memory; ++ int len; ++ } ucopy; ++ ++ __u32 snd_wl1; /* Sequence for window update */ ++ __u32 snd_wnd; /* The window we expect to receive */ ++ __u32 max_window; /* Maximal window ever seen from peer */ ++ __u32 pmtu_cookie; /* Last pmtu seen by socket */ ++ __u16 mss_cache; /* Cached effective mss, not including SACKS */ ++ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ ++ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ ++ __u8 ca_state; /* State of fast-retransmit machine */ ++ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ ++ ++ __u8 reordering; /* Packet reordering metric. 
*/ ++ __u8 queue_shrunk; /* Write queue has been shrunk recently.*/ ++ __u8 defer_accept; /* User waits for some data after accept() */ ++ ++/* RTT measurement */ ++ __u8 backoff; /* backoff */ ++ __u32 srtt; /* smothed round trip time << 3 */ ++ __u32 mdev; /* medium deviation */ ++ __u32 mdev_max; /* maximal mdev for the last rtt period */ ++ __u32 rttvar; /* smoothed mdev_max */ ++ __u32 rtt_seq; /* sequence number to update rttvar */ ++ __u32 rto; /* retransmit timeout */ ++ ++ __u32 packets_out; /* Packets which are "in flight" */ ++ __u32 left_out; /* Packets which leaved network */ ++ __u32 retrans_out; /* Retransmitted packets out */ ++ ++ ++/* ++ * Slow start and congestion control (see also Nagle, and Karn & Partridge) ++ */ ++ __u32 snd_ssthresh; /* Slow start size threshold */ ++ __u32 snd_cwnd; /* Sending congestion window */ ++ __u16 snd_cwnd_cnt; /* Linear increase counter */ ++ __u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ ++ __u32 snd_cwnd_used; ++ __u32 snd_cwnd_stamp; ++ ++ /* Two commonly used timers in both sender and receiver paths. */ ++ unsigned long timeout; ++ struct timer_list retransmit_timer; /* Resend (no ack) */ ++ struct timer_list delack_timer; /* Ack delay */ ++ ++ struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ ++ ++ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ ++ struct sk_buff *send_head; /* Front of stuff to transmit */ ++ struct page *sndmsg_page; /* Cached page for sendmsg */ ++ u32 sndmsg_off; /* Cached offset for sendmsg */ ++ ++ __u32 rcv_wnd; /* Current receiver window */ ++ __u32 rcv_wup; /* rcv_nxt on last window update sent */ ++ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ ++ __u32 pushed_seq; /* Last pushed seq, required to talk to windows */ ++ __u32 copied_seq; /* Head of yet unread data */ ++/* ++ * Options received (usually on last packet, some only on SYN packets). 
++ */ ++ char tstamp_ok, /* TIMESTAMP seen on SYN packet */ ++ wscale_ok, /* Wscale seen on SYN packet */ ++ sack_ok; /* SACK seen on SYN packet */ ++ char saw_tstamp; /* Saw TIMESTAMP on last packet */ ++ __u8 snd_wscale; /* Window scaling received from sender */ ++ __u8 rcv_wscale; /* Window scaling to send to receiver */ ++ __u8 nonagle; /* Disable Nagle algorithm? */ ++ __u8 keepalive_probes; /* num of allowed keep alive probes */ ++ ++/* PAWS/RTTM data */ ++ __u32 rcv_tsval; /* Time stamp value */ ++ __u32 rcv_tsecr; /* Time stamp echo reply */ ++ __u32 ts_recent; /* Time stamp to echo next */ ++ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ ++ ++/* SACKs data */ ++ __u16 user_mss; /* mss requested by user in ioctl */ ++ __u8 dsack; /* D-SACK is scheduled */ ++ __u8 eff_sacks; /* Size of SACK array to send with next packet */ ++ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ ++ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ ++ ++ __u32 window_clamp; /* Maximal window to advertise */ ++ __u32 rcv_ssthresh; /* Current window clamp */ ++ __u8 probes_out; /* unanswered 0 window probes */ ++ __u8 num_sacks; /* Number of SACK blocks */ ++ __u16 advmss; /* Advertised MSS */ ++ ++ __u8 syn_retries; /* num of allowed syn retries */ ++ __u8 ecn_flags; /* ECN status bits. */ ++ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ ++ __u32 lost_out; /* Lost packets */ ++ __u32 sacked_out; /* SACK'd packets */ ++ __u32 fackets_out; /* FACK'd packets */ ++ __u32 high_seq; /* snd_nxt at onset of congestion */ ++ ++ __u32 retrans_stamp; /* Timestamp of the last retransmit, ++ * also used in SYN-SENT to remember stamp of ++ * the first SYN. */ ++ __u32 undo_marker; /* tracking retrans started here. */ ++ int undo_retrans; /* number of undoable retransmissions. 
*/ ++ __u32 urg_seq; /* Seq of received urgent pointer */ ++ __u16 urg_data; /* Saved octet of OOB data and control flags */ ++ __u8 pending; /* Scheduled timer event */ ++ __u8 urg_mode; /* In urgent mode */ ++ __u32 snd_up; /* Urgent pointer */ ++ ++ /* The syn_wait_lock is necessary only to avoid tcp_get_info having ++ * to grab the main lock sock while browsing the listening hash ++ * (otherwise it's deadlock prone). ++ * This lock is acquired in read mode only from tcp_get_info() and ++ * it's acquired in write mode _only_ from code that is actively ++ * changing the syn_wait_queue. All readers that are holding ++ * the master sock lock don't need to grab this lock in read mode ++ * too as the syn_wait_queue writes are always protected from ++ * the main sock lock. ++ */ ++ rwlock_t syn_wait_lock; ++ struct tcp_listen_opt *listen_opt; ++ ++ /* FIFO of established children */ ++ struct open_request *accept_queue; ++ struct open_request *accept_queue_tail; ++ ++ int write_pending; /* A write to socket waits to start. */ ++ ++ unsigned int keepalive_time; /* time before keep alive takes place */ ++ unsigned int keepalive_intvl; /* time interval between keep alive probes */ ++ int linger2; ++ ++ __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ ++ __u8 frto_counter; /* Number of new acks after RTO */ ++ __u32 frto_highmark; /* snd_nxt when RTO occurred */ ++ ++ unsigned long last_synq_overflow; ++ ++/* Receiver side RTT estimation */ ++ struct { ++ __u32 rtt; ++ __u32 seq; ++ __u32 time; ++ } rcv_rtt_est; ++ ++/* Receiver queue space */ ++ struct { ++ int space; ++ __u32 seq; ++ __u32 time; ++ } rcvq_space; ++ ++/* TCP Westwood structure */ ++ struct { ++ __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ ++ __u32 bw_est; /* bandwidth estimate */ ++ __u32 rtt_win_sx; /* here starts a new evaluation... 
*/ ++ __u32 bk; ++ __u32 snd_una; /* used for evaluating the number of acked bytes */ ++ __u32 cumul_ack; ++ __u32 accounted; ++ __u32 rtt; ++ __u32 rtt_min; /* minimum observed RTT */ ++ } westwood; ++ ++/* Vegas variables */ ++ struct { ++ __u32 beg_snd_nxt; /* right edge during last RTT */ ++ __u32 beg_snd_una; /* left edge during last RTT */ ++ __u32 beg_snd_cwnd; /* saves the size of the cwnd */ ++ __u8 doing_vegas_now;/* if true, do vegas for this RTT */ ++ __u16 cntRTT; /* # of RTTs measured within last RTT */ ++ __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ ++ __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ ++ } vegas; ++ ++ /* BI TCP Parameters */ ++ struct { ++ __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ ++ __u32 last_max_cwnd; /* last maximium snd_cwnd */ ++ __u32 last_cwnd; /* the last snd_cwnd */ ++ __u32 last_stamp; /* time when updated last_cwnd */ ++ } bictcp; ++}; ++ ++ ++/* ++ * This structure really needs to be cleaned up. ++ * Most of it is for TCP, and not used by any of ++ * the other protocols. ++ */ ++ ++/* ++ * The idea is to start moving to a newer struct gradualy ++ * ++ * IMHO the newer struct should have the following format: ++ * ++ * struct sock { ++ * sockmem [mem, proto, callbacks] ++ * ++ * union or struct { ++ * ax25; ++ * } ll_pinfo; ++ * ++ * union { ++ * ipv4; ++ * ipv6; ++ * ipx; ++ * netrom; ++ * rose; ++ * x25; ++ * } net_pinfo; ++ * ++ * union { ++ * tcp; ++ * udp; ++ * spx; ++ * netrom; ++ * } tp_pinfo; ++ * ++ * } ++ * ++ * The idea failed because IPv6 transition asssumes dual IP/IPv6 sockets. ++ * So, net_pinfo is IPv6 are really, and protinfo unifies all another ++ * private areas. ++ */ ++ ++/* Define this to get the sk->debug debugging facility. */ ++#define SOCK_DEBUGGING ++#ifdef SOCK_DEBUGGING ++#define SOCK_DEBUG(sk, msg...) do { if((sk) && ((sk)->debug)) printk(KERN_DEBUG msg); } while (0) ++#else ++#define SOCK_DEBUG(sk, msg...) 
do { } while (0) ++#endif ++ ++/* This is the per-socket lock. The spinlock provides a synchronization ++ * between user contexts and software interrupt processing, whereas the ++ * mini-semaphore synchronizes multiple users amongst themselves. ++ */ ++typedef struct { ++ spinlock_t slock; ++ unsigned int users; ++ wait_queue_head_t wq; ++} socket_lock_t; ++ ++#define sock_lock_init(__sk) \ ++do { spin_lock_init(&((__sk)->lock.slock)); \ ++ (__sk)->lock.users = 0; \ ++ init_waitqueue_head(&((__sk)->lock.wq)); \ ++} while(0) ++ ++struct sock { ++ /* Socket demultiplex comparisons on incoming packets. */ ++ __u32 daddr; /* Foreign IPv4 addr */ ++ __u32 rcv_saddr; /* Bound local IPv4 addr */ ++ __u16 dport; /* Destination port */ ++ unsigned short num; /* Local port */ ++ int bound_dev_if; /* Bound device index if != 0 */ ++ ++ /* Main hash linkage for various protocol lookup tables. */ ++ struct sock *next; ++ struct sock **pprev; ++ struct sock *bind_next; ++ struct sock **bind_pprev; ++ ++ volatile unsigned char state, /* Connection state */ ++ zapped; /* In ax25 & ipx means not linked */ ++ __u16 sport; /* Source port */ ++ ++ unsigned short family; /* Address family */ ++ unsigned char reuse; /* SO_REUSEADDR setting */ ++ unsigned char shutdown; ++ atomic_t refcnt; /* Reference count */ ++ ++ socket_lock_t lock; /* Synchronizer... */ ++ int rcvbuf; /* Size of receive buffer in bytes */ ++ ++ wait_queue_head_t *sleep; /* Sock wait queue */ ++ struct dst_entry *dst_cache; /* Destination cache */ ++ rwlock_t dst_lock; ++ atomic_t rmem_alloc; /* Receive queue bytes committed */ ++ struct sk_buff_head receive_queue; /* Incoming packets */ ++ atomic_t wmem_alloc; /* Transmit queue bytes committed */ ++ struct sk_buff_head write_queue; /* Packet sending queue */ ++ atomic_t omem_alloc; /* "o" is "option" or "other" */ ++ int wmem_queued; /* Persistent queue size */ ++ int forward_alloc; /* Space allocated forward. 
*/ ++ __u32 saddr; /* Sending source */ ++ unsigned int allocation; /* Allocation mode */ ++ int sndbuf; /* Size of send buffer in bytes */ ++ struct sock *prev; ++ ++ /* Not all are volatile, but some are, so we might as well say they all are. ++ * XXX Make this a flag word -DaveM ++ */ ++ volatile char dead, ++ done, ++ urginline, ++ keepopen, ++ linger, ++ destroy, ++ no_check, ++ broadcast, ++ bsdism; ++ unsigned char debug; ++ unsigned char rcvtstamp; ++ unsigned char use_write_queue; ++ unsigned char userlocks; ++ /* Hole of 3 bytes. Try to pack. */ ++ int route_caps; ++ int proc; ++ unsigned long lingertime; ++ ++ int hashent; ++ struct sock *pair; ++ ++ /* The backlog queue is special, it is always used with ++ * the per-socket spinlock held and requires low latency ++ * access. Therefore we special case it's implementation. ++ */ ++ struct { ++ struct sk_buff *head; ++ struct sk_buff *tail; ++ } backlog; ++ ++ rwlock_t callback_lock; ++ ++ /* Error queue, rarely used. */ ++ struct sk_buff_head error_queue; ++ ++ struct proto *prot; ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ union { ++ struct ipv6_pinfo af_inet6; ++ } net_pinfo; ++#endif ++ ++ union { ++ struct tcp_opt af_tcp; ++#if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE) ++ struct sctp_opt af_sctp; ++#endif ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++ struct raw_opt tp_raw4; ++#endif ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ struct raw6_opt tp_raw; ++#endif /* CONFIG_IPV6 */ ++#if defined(CONFIG_SPX) || defined (CONFIG_SPX_MODULE) ++ struct spx_opt af_spx; ++#endif /* CONFIG_SPX */ ++ ++ } tp_pinfo; ++ ++ int err, err_soft; /* Soft holds errors that don't ++ cause failure but are the cause ++ of a persistent failure not just ++ 'timed out' */ ++ unsigned short ack_backlog; ++ unsigned short max_ack_backlog; ++ __u32 priority; ++ unsigned short type; ++ unsigned char localroute; /* Route locally only */ ++ unsigned char protocol; 
++ struct ucred peercred; ++ int rcvlowat; ++ long rcvtimeo; ++ long sndtimeo; ++ ++#ifdef CONFIG_FILTER ++ /* Socket Filtering Instructions */ ++ struct sk_filter *filter; ++#endif /* CONFIG_FILTER */ ++ ++ /* This is where all the private (optional) areas that don't ++ * overlap will eventually live. ++ */ ++ union { ++ void *destruct_hook; ++ struct unix_opt af_unix; ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++ struct inet_opt af_inet; ++#endif ++#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) ++ struct atalk_sock af_at; ++#endif ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++ struct ipx_opt af_ipx; ++#endif ++#if defined (CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) ++ struct dn_scp dn; ++#endif ++#if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) ++ struct packet_opt *af_packet; ++#endif ++#if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) ++ x25_cb *x25; ++#endif ++#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) ++ ax25_cb *ax25; ++#endif ++#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) ++ nr_cb *nr; ++#endif ++#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) ++ rose_cb *rose; ++#endif ++#if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE) ++ struct pppox_opt *pppox; ++#endif ++ struct netlink_opt *af_netlink; ++#if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE) ++ struct econet_opt *af_econet; ++#endif ++#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) ++ struct atm_vcc *af_atm; ++#endif ++#if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE) ++ struct irda_sock *irda; ++#endif ++#if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) ++ struct wanpipe_opt *af_wanpipe; ++#endif ++ } protinfo; ++ ++ ++ /* This part is used for the timeout functions. */ ++ struct timer_list timer; /* This is the sock cleanup timer. 
*/ ++ struct timeval stamp; ++ ++ /* Identd and reporting IO signals */ ++ struct socket *socket; ++ ++ /* RPC layer private data */ ++ void *user_data; ++ ++ /* Callbacks */ ++ void (*state_change)(struct sock *sk); ++ void (*data_ready)(struct sock *sk,int bytes); ++ void (*write_space)(struct sock *sk); ++ void (*error_report)(struct sock *sk); ++ ++ int (*backlog_rcv) (struct sock *sk, ++ struct sk_buff *skb); ++ void (*destruct)(struct sock *sk); ++}; ++ ++/* The per-socket spinlock must be held here. */ ++#define sk_add_backlog(__sk, __skb) \ ++do { if((__sk)->backlog.tail == NULL) { \ ++ (__sk)->backlog.head = \ ++ (__sk)->backlog.tail = (__skb); \ ++ } else { \ ++ ((__sk)->backlog.tail)->next = (__skb); \ ++ (__sk)->backlog.tail = (__skb); \ ++ } \ ++ (__skb)->next = NULL; \ ++} while(0) ++ ++/* IP protocol blocks we attach to sockets. ++ * socket layer -> transport layer interface ++ * transport -> network interface is defined by struct inet_proto ++ */ ++struct proto { ++ void (*close)(struct sock *sk, ++ long timeout); ++ int (*connect)(struct sock *sk, ++ struct sockaddr *uaddr, ++ int addr_len); ++ int (*disconnect)(struct sock *sk, int flags); ++ ++ struct sock * (*accept) (struct sock *sk, int flags, int *err); ++ ++ int (*ioctl)(struct sock *sk, int cmd, ++ unsigned long arg); ++ int (*init)(struct sock *sk); ++ int (*destroy)(struct sock *sk); ++ void (*shutdown)(struct sock *sk, int how); ++ int (*setsockopt)(struct sock *sk, int level, ++ int optname, char *optval, int optlen); ++ int (*getsockopt)(struct sock *sk, int level, ++ int optname, char *optval, ++ int *option); ++ int (*sendmsg)(struct sock *sk, struct msghdr *msg, ++ int len); ++ int (*recvmsg)(struct sock *sk, struct msghdr *msg, ++ int len, int noblock, int flags, ++ int *addr_len); ++ int (*bind)(struct sock *sk, ++ struct sockaddr *uaddr, int addr_len); ++ ++ int (*backlog_rcv) (struct sock *sk, ++ struct sk_buff *skb); ++ ++ /* Keeping track of sk's, looking them up, and port 
selection methods. */ ++ void (*hash)(struct sock *sk); ++ void (*unhash)(struct sock *sk); ++ int (*get_port)(struct sock *sk, unsigned short snum); ++ ++ char name[32]; ++ ++ struct { ++ int inuse; ++ u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; ++ } stats[NR_CPUS]; ++}; ++ ++/* Called with local bh disabled */ ++static __inline__ void sock_prot_inc_use(struct proto *prot) ++{ ++ prot->stats[smp_processor_id()].inuse++; ++} ++ ++static __inline__ void sock_prot_dec_use(struct proto *prot) ++{ ++ prot->stats[smp_processor_id()].inuse--; ++} ++ ++/* About 10 seconds */ ++#define SOCK_DESTROY_TIME (10*HZ) ++ ++/* Sockets 0-1023 can't be bound to unless you are superuser */ ++#define PROT_SOCK 1024 ++ ++#define SHUTDOWN_MASK 3 ++#define RCV_SHUTDOWN 1 ++#define SEND_SHUTDOWN 2 ++ ++#define SOCK_SNDBUF_LOCK 1 ++#define SOCK_RCVBUF_LOCK 2 ++#define SOCK_BINDADDR_LOCK 4 ++#define SOCK_BINDPORT_LOCK 8 ++ ++ ++/* Used by processes to "lock" a socket state, so that ++ * interrupts and bottom half handlers won't change it ++ * from under us. It essentially blocks any incoming ++ * packets, so that we won't get any new data or any ++ * packets that change the state of the socket. ++ * ++ * While locked, BH processing will add new packets to ++ * the backlog queue. This queue is processed by the ++ * owner of the socket lock right before it is released. ++ * ++ * Since ~2.3.5 it is also exclusive sleep lock serializing ++ * accesses from user process context. 
++ */ ++extern void __lock_sock(struct sock *sk); ++extern void __release_sock(struct sock *sk); ++#define lock_sock(__sk) \ ++do { spin_lock_bh(&((__sk)->lock.slock)); \ ++ if ((__sk)->lock.users != 0) \ ++ __lock_sock(__sk); \ ++ (__sk)->lock.users = 1; \ ++ spin_unlock_bh(&((__sk)->lock.slock)); \ ++} while(0) ++/* release_sock(): under lock.slock, call __release_sock() when backlog.tail is non-NULL, clear lock.users, then wake lock.wq if it has waiters. */ ++#define release_sock(__sk) \ ++do { spin_lock_bh(&((__sk)->lock.slock)); \ ++ if ((__sk)->backlog.tail != NULL) \ ++ __release_sock(__sk); \ ++ (__sk)->lock.users = 0; \ ++ if (waitqueue_active(&((__sk)->lock.wq))) wake_up(&((__sk)->lock.wq)); \ ++ spin_unlock_bh(&((__sk)->lock.slock)); \ ++} while(0) ++ ++/* BH context may only use the following locking interface. */ ++#define bh_lock_sock(__sk) spin_lock(&((__sk)->lock.slock)) ++#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->lock.slock)) ++/* Prototypes only from here on: these functions are declared extern and defined outside this header. */ ++extern struct sock * sk_alloc(int family, int priority, int zero_it); ++extern void sk_free(struct sock *sk); ++ ++extern struct sk_buff *sock_wmalloc(struct sock *sk, ++ unsigned long size, int force, ++ int priority); ++extern struct sk_buff *sock_rmalloc(struct sock *sk, ++ unsigned long size, int force, ++ int priority); ++extern void sock_wfree(struct sk_buff *skb); ++extern void sock_rfree(struct sk_buff *skb); ++ ++extern int sock_setsockopt(struct socket *sock, int level, ++ int op, char *optval, ++ int optlen); ++ ++extern int sock_getsockopt(struct socket *sock, int level, ++ int op, char *optval, ++ int *optlen); ++extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, ++ unsigned long size, ++ int noblock, ++ int *errcode); ++extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, ++ unsigned long header_len, ++ unsigned long data_len, ++ int noblock, ++ int *errcode); ++extern void *sock_kmalloc(struct sock *sk, int size, int priority); ++extern void sock_kfree_s(struct sock *sk, void *mem, int size); ++ ++/* ++ * Functions to fill in entries in struct proto_ops when a protocol ++ * does not implement a particular function.
++ */ ++extern int sock_no_release(struct socket *); ++extern int sock_no_bind(struct socket *, ++ struct sockaddr *, int); ++extern int sock_no_connect(struct socket *, ++ struct sockaddr *, int, int); ++extern int sock_no_socketpair(struct socket *, ++ struct socket *); ++extern int sock_no_accept(struct socket *, ++ struct socket *, int); ++extern int sock_no_getname(struct socket *, ++ struct sockaddr *, int *, int); ++extern unsigned int sock_no_poll(struct file *, struct socket *, ++ struct poll_table_struct *); ++extern int sock_no_ioctl(struct socket *, unsigned int, ++ unsigned long); ++extern int sock_no_listen(struct socket *, int); ++extern int sock_no_shutdown(struct socket *, int); ++extern int sock_no_getsockopt(struct socket *, int , int, ++ char *, int *); ++extern int sock_no_setsockopt(struct socket *, int, int, ++ char *, int); ++extern int sock_no_fcntl(struct socket *, ++ unsigned int, unsigned long); ++extern int sock_no_sendmsg(struct socket *, ++ struct msghdr *, int, ++ struct scm_cookie *); ++extern int sock_no_recvmsg(struct socket *, ++ struct msghdr *, int, int, ++ struct scm_cookie *); ++extern int sock_no_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma); ++extern ssize_t sock_no_sendpage(struct socket *sock, ++ struct page *page, ++ int offset, size_t size, ++ int flags); ++ ++/* ++ * Default socket callbacks and setup code ++ */ ++ ++extern void sock_def_destruct(struct sock *); ++ ++/* Initialise core socket variables */ ++extern void sock_init_data(struct socket *sock, struct sock *sk); ++ ++extern void sklist_remove_socket(struct sock **list, struct sock *sk); ++extern void sklist_insert_socket(struct sock **list, struct sock *sk); ++extern void sklist_destroy_socket(struct sock **list, struct sock *sk); ++/* Filter helpers: the real versions are built only with CONFIG_FILTER; the #else branch supplies an sk_filter() stub that always returns 0. */ ++#ifdef CONFIG_FILTER ++ ++/** ++ * sk_filter - run a packet through a socket filter ++ * @sk: sock associated with &sk_buff ++ * @skb: buffer to filter ++ * @needlock: set to 1 if the sock is
not locked by caller. ++ * ++ * Run the filter code and then cut skb->data to correct size returned by ++ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller ++ * than pkt_len we keep whole skb->data. This is the socket level ++ * wrapper to sk_run_filter. It returns 0 if the packet should ++ * be accepted or -EPERM if the packet should be tossed. ++ */ ++ ++static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) ++{ ++ int err = 0; ++ ++ if (sk->filter) { ++ struct sk_filter *filter; ++ ++ if (needlock) ++ bh_lock_sock(sk); ++ ++ filter = sk->filter; ++ if (filter) { ++ int pkt_len = sk_run_filter(skb, filter->insns, ++ filter->len); ++ if (!pkt_len) ++ err = -EPERM; ++ else ++ skb_trim(skb, pkt_len); ++ } ++ ++ if (needlock) ++ bh_unlock_sock(sk); ++ } ++ return err; ++} ++ ++/** ++ * sk_filter_release: Release a socket filter ++ * @sk: socket ++ * @fp: filter to remove ++ * ++ * Uncharge sk_filter_len(fp) bytes from sk->omem_alloc, drop one reference ++ * on fp and kfree() it when that was the last reference. ++ */ ++ ++static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp) ++{ ++ unsigned int size = sk_filter_len(fp); ++ ++ atomic_sub(size, &sk->omem_alloc); ++ ++ if (atomic_dec_and_test(&fp->refcnt)) ++ kfree(fp); ++} ++/* Counterpart of sk_filter_release(): take a reference on fp and charge sk_filter_len(fp) bytes to sk->omem_alloc. */ ++static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) ++{ ++ atomic_inc(&fp->refcnt); ++ atomic_add(sk_filter_len(fp), &sk->omem_alloc); ++} ++ ++#else ++/* CONFIG_FILTER is not defined: stub that runs no filter and always returns 0. */ ++static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_FILTER */ ++ ++/* ++ * Socket reference counting postulates. ++ * ++ * * Each user of socket SHOULD hold a reference count. ++ * * Each access point to socket (a hash table bucket, reference from a list, ++ * running timer, skb in flight) MUST hold a reference count. ++ * * When reference count hits 0, it means it will never increase back.
++ * * When reference count hits 0, it means that no references from ++ * outside exist to this socket and current process on current CPU ++ * is last user and may/should destroy this socket. ++ * * sk_free is called from any context: process, BH, IRQ. When ++ * it is called, socket has no references from outside -> sk_free ++ * may release descendant resources allocated by the socket, but ++ * by the time it is called, socket is NOT referenced by any ++ * hash tables, lists etc. ++ * * Packets, delivered from outside (from network or from another process) ++ * and enqueued on receive/error queues SHOULD NOT grab reference count, ++ * when they sit in queue. Otherwise, packets will leak to hole, when ++ * socket is looked up by one cpu and unhashing is done by another CPU. ++ * It is true for udp/raw, netlink (leak to receive and error queues), tcp ++ * (leak to backlog). Packet socket does all the processing inside ++ * BR_NETPROTO_LOCK, so it does not have this race condition. UNIX sockets ++ * use separate SMP lock, so that they are prone too. ++ */ ++ ++/* Grab socket reference count. This operation is valid only ++ when sk is ALREADY grabbed, e.g. it is found in hash table ++ or a list and the lookup is made under lock preventing hash table ++ modifications. ++ */ ++ ++static inline void sock_hold(struct sock *sk) ++{ ++ atomic_inc(&sk->refcnt); ++} ++ ++/* Ungrab socket in the context, which assumes that socket refcnt ++ cannot hit zero, e.g. it is true in context of any socketcall. ++ Unlike sock_put() this never calls sk_free(). ++ */ ++static inline void __sock_put(struct sock *sk) ++{ ++ atomic_dec(&sk->refcnt); ++} ++ ++/* Ungrab socket and destroy it, if it was the last reference. */ ++static inline void sock_put(struct sock *sk) ++{ ++ if (atomic_dec_and_test(&sk->refcnt)) ++ sk_free(sk); ++} ++ ++/* Detach socket from process context. ++ * Announce socket dead, detach it from wait queue and inode.
++ * Note that parent inode held reference count on this struct sock, ++ * we do not release it in this function, because protocol ++ * probably wants some additional cleanups or even continuing ++ * to work with this socket (TCP). ++ */ ++static inline void sock_orphan(struct sock *sk) ++{ ++ write_lock_bh(&sk->callback_lock); ++ sk->dead = 1; ++ sk->socket = NULL; ++ sk->sleep = NULL; ++ write_unlock_bh(&sk->callback_lock); ++} ++/* Attach sk to parent under callback_lock: sk->sleep points at parent->wait, parent->sk and sk->socket are linked to each other. */ ++static inline void sock_graft(struct sock *sk, struct socket *parent) ++{ ++ write_lock_bh(&sk->callback_lock); ++ sk->sleep = &parent->wait; ++ parent->sk = sk; ++ sk->socket = parent; ++ write_unlock_bh(&sk->callback_lock); ++} ++/* Return i_uid of the inode behind sk->socket, or 0 when sk->socket is NULL; read under callback_lock. */ ++static inline int sock_i_uid(struct sock *sk) ++{ ++ int uid; ++ ++ read_lock(&sk->callback_lock); ++ uid = sk->socket ? sk->socket->inode->i_uid : 0; ++ read_unlock(&sk->callback_lock); ++ return uid; ++} ++/* Return i_ino of the inode behind sk->socket, or 0 when sk->socket is NULL; read under callback_lock. */ ++static inline unsigned long sock_i_ino(struct sock *sk) ++{ ++ unsigned long ino; ++ ++ read_lock(&sk->callback_lock); ++ ino = sk->socket ?
sk->socket->inode->i_ino : 0; ++ read_unlock(&sk->callback_lock); ++ return ino; ++} ++/* Return sk->dst_cache as is: no dst_lock, no reference taken. */ ++static inline struct dst_entry * ++__sk_dst_get(struct sock *sk) ++{ ++ return sk->dst_cache; ++} ++/* Return sk->dst_cache (may be NULL) after dst_hold() on it, under the dst_lock read lock. */ ++static inline struct dst_entry * ++sk_dst_get(struct sock *sk) ++{ ++ struct dst_entry *dst; ++ ++ read_lock(&sk->dst_lock); ++ dst = sk->dst_cache; ++ if (dst) ++ dst_hold(dst); ++ read_unlock(&sk->dst_lock); ++ return dst; ++} ++/* Store dst in sk->dst_cache and dst_release() the previous entry; dst_lock is not taken here. */ ++static inline void ++__sk_dst_set(struct sock *sk, struct dst_entry *dst) ++{ ++ struct dst_entry *old_dst; ++ ++ old_dst = sk->dst_cache; ++ sk->dst_cache = dst; ++ dst_release(old_dst); ++} ++/* __sk_dst_set() wrapped in the dst_lock write lock. */ ++static inline void ++sk_dst_set(struct sock *sk, struct dst_entry *dst) ++{ ++ write_lock(&sk->dst_lock); ++ __sk_dst_set(sk, dst); ++ write_unlock(&sk->dst_lock); ++} ++/* Set sk->dst_cache to NULL and dst_release() the previous entry; dst_lock is not taken here. */ ++static inline void ++__sk_dst_reset(struct sock *sk) ++{ ++ struct dst_entry *old_dst; ++ ++ old_dst = sk->dst_cache; ++ sk->dst_cache = NULL; ++ dst_release(old_dst); ++} ++/* __sk_dst_reset() wrapped in the dst_lock write lock. */ ++static inline void ++sk_dst_reset(struct sock *sk) ++{ ++ write_lock(&sk->dst_lock); ++ __sk_dst_reset(sk); ++ write_unlock(&sk->dst_lock); ++} ++/* Return sk->dst_cache, or clear it and return NULL when the entry is obsolete and dst->ops->check() returns NULL. No lock, no reference taken. */ ++static inline struct dst_entry * ++__sk_dst_check(struct sock *sk, u32 cookie) ++{ ++ struct dst_entry *dst = sk->dst_cache; ++ ++ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { ++ sk->dst_cache = NULL; ++ return NULL; ++ } ++ ++ return dst; ++} ++/* Same test as __sk_dst_check() on the entry returned by sk_dst_get(); on failure the cache is cleared through sk_dst_reset(). NOTE(review): this function does not dst_release() the sk_dst_get() reference on the NULL path - presumably ops->check() drops it; confirm. */ ++static inline struct dst_entry * ++sk_dst_check(struct sock *sk, u32 cookie) ++{ ++ struct dst_entry *dst = sk_dst_get(sk); ++ ++ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { ++ sk_dst_reset(sk); ++ return NULL; ++ } ++ ++ return dst; ++} ++ ++ ++/* ++ * Queue a received datagram if it will fit. Stream and sequenced ++ * protocols can't normally use this as they need to fit buffers in ++ * and play with them. ++ * ++ * Inlined as it's very short and called for pretty much every ++ * packet ever received.
++ */ ++/* Bind skb to sk: take a sock reference, install sock_wfree as the skb destructor and add skb->truesize to sk->wmem_alloc. */ ++static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) ++{ ++ sock_hold(sk); ++ skb->sk = sk; ++ skb->destructor = sock_wfree; ++ atomic_add(skb->truesize, &sk->wmem_alloc); ++} ++/* Bind skb to sk: install sock_rfree as the skb destructor and add skb->truesize to sk->rmem_alloc. No sock reference is taken here. */ ++static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) ++{ ++ skb->sk = sk; ++ skb->destructor = sock_rfree; ++ atomic_add(skb->truesize, &sk->rmem_alloc); ++} ++/* Returns 0 on success, -ENOMEM when rmem_alloc plus skb->truesize reaches sk->rcvbuf, or the error returned by sk_filter(). */ ++static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ int err = 0; ++ int skb_len; ++ ++ /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces ++ number of warnings when compiling with -W --ANK ++ */ ++ if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ /* It would be deadlock, if sock_queue_rcv_skb is used ++ with socket lock! We assume that users of this ++ function are lock free. ++ */ ++ err = sk_filter(sk, skb, 1); ++ if (err) ++ goto out; ++ ++ skb->dev = NULL; ++ skb_set_owner_r(skb, sk); ++ ++ /* Cache the SKB length before we tack it onto the receive ++ * queue. Once it is added it no longer belongs to us and ++ * may be freed by other threads of control pulling packets ++ * from the queue. ++ */ ++ skb_len = skb->len; ++ ++ skb_queue_tail(&sk->receive_queue, skb); ++ if (!sk->dead) ++ sk->data_ready(sk,skb_len); ++out: ++ return err; ++} ++/* Error-queue variant: no filter is run and the skb is appended to sk->error_queue. NOTE(review): skb->len is read after skb_queue_tail(), which the length-caching comment in sock_queue_rcv_skb() warns against - confirm whether that is safe here. */ ++static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ /* Cast sk->rcvbuf to unsigned...
It's pointless, but reduces ++ number of warnings when compiling with -W --ANK ++ */ ++ if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) ++ return -ENOMEM; ++ skb_set_owner_r(skb, sk); ++ skb_queue_tail(&sk->error_queue,skb); ++ if (!sk->dead) ++ sk->data_ready(sk,skb->len); ++ return 0; ++} ++ ++/* ++ * Recover an error report and clear atomically: sk->err is swapped with 0 ++ * through xchg() and the old value is returned negated. ++ */ ++ ++static inline int sock_error(struct sock *sk) ++{ ++ int err=xchg(&sk->err,0); ++ return -err; ++} ++/* Returns sndbuf minus wmem_alloc clamped at 0, or 0 when SEND_SHUTDOWN is set in sk->shutdown. */ ++static inline unsigned long sock_wspace(struct sock *sk) ++{ ++ int amt = 0; ++ ++ if (!(sk->shutdown & SEND_SHUTDOWN)) { ++ amt = sk->sndbuf - atomic_read(&sk->wmem_alloc); ++ if (amt < 0) ++ amt = 0; ++ } ++ return amt; ++} ++/* Calls sock_wake_async() only when sk->socket is set and its fasync_list is non-NULL. */ ++static inline void sk_wake_async(struct sock *sk, int how, int band) ++{ ++ if (sk->socket && sk->socket->fasync_list) ++ sock_wake_async(sk->socket, how, band); ++} ++ ++#define SOCK_MIN_SNDBUF 2048 ++#define SOCK_MIN_RCVBUF 256 ++ ++/* ++ * Default write policy as shown to user space via poll/select/SIGIO: ++ * writeable while wmem_alloc is below half of sndbuf. ++ */ ++static inline int sock_writeable(struct sock *sk) ++{ ++ return atomic_read(&sk->wmem_alloc) < (sk->sndbuf / 2); ++} ++/* GFP_ATOMIC when in_softirq() is true, GFP_KERNEL otherwise. */ ++static inline int gfp_any(void) ++{ ++ return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; ++} ++/* Receive timeout to use: 0 when noblock is set, sk->rcvtimeo otherwise. */ ++static inline long sock_rcvtimeo(struct sock *sk, int noblock) ++{ ++ return noblock ? 0 : sk->rcvtimeo; ++} ++/* Send timeout to use: 0 when noblock is set, sk->sndtimeo otherwise. */ ++static inline long sock_sndtimeo(struct sock *sk, int noblock) ++{ ++ return noblock ? 0 : sk->sndtimeo; ++} ++/* Low-water mark to use: len when waitall is set, else the smaller of sk->rcvlowat and len; a result of 0 is replaced by 1 (GNU ?: extension). */ ++static inline int sock_rcvlowat(struct sock *sk, int waitall, int len) ++{ ++ return (waitall ? len : min_t(int, sk->rcvlowat, len)) ? : 1; ++} ++ ++/* Alas, with timeout socket operations are not restartable. ++ * Compare this to poll(). ++ */ ++static inline int sock_intr_errno(long timeo) ++{ ++ return timeo == MAX_SCHEDULE_TIMEOUT ?
-ERESTARTSYS : -EINTR; ++} ++/* With sk->rcvtstamp set, hand skb->stamp to put_cmsg() as an SO_TIMESTAMP control message; otherwise copy it into sk->stamp. */ ++static __inline__ void ++sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) ++{ ++ if (sk->rcvtstamp) ++ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(skb->stamp), &skb->stamp); ++ else ++ sk->stamp = skb->stamp; ++} ++ ++/* ++ * Enable debug/info messages ++ */ ++ ++#if 0 ++#define NETDEBUG(x) do { } while (0) ++#else ++#define NETDEBUG(x) do { x; } while (0) ++#endif ++ ++/* ++ * Macros for sleeping on a socket. Use them like this: ++ * ++ * SOCK_SLEEP_PRE(sk) ++ * if (condition) ++ * schedule(); ++ * SOCK_SLEEP_POST(sk) ++ * ++ */ ++ ++#define SOCK_SLEEP_PRE(sk) { struct task_struct *tsk = current; \ ++ DECLARE_WAITQUEUE(wait, tsk); \ ++ tsk->state = TASK_INTERRUPTIBLE; \ ++ add_wait_queue((sk)->sleep, &wait); \ ++ release_sock(sk); ++ ++#define SOCK_SLEEP_POST(sk) tsk->state = TASK_RUNNING; \ ++ remove_wait_queue((sk)->sleep, &wait); \ ++ lock_sock(sk); \ ++ } ++ ++extern __u32 sysctl_wmem_max; ++extern __u32 sysctl_rmem_max; ++ ++#endif /* _SOCK_H */ +diff --unified --recursive --new-file linux-2.4.30/net/Config.in linux-2.4.30-1-686-smp-ring3/net/Config.in +--- linux-2.4.30/net/Config.in 2005-01-19 15:10:13.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Config.in 2005-10-22 23:08:28.028051250 +0200 +@@ -15,6 +15,9 @@ + bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG + fi + bool 'Socket Filtering' CONFIG_FILTER ++if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_FILTER" = "y" ]; then ++ source net/ring/Config.in ++fi + tristate 'Unix domain sockets' CONFIG_UNIX + bool 'TCP/IP networking' CONFIG_INET + if [ "$CONFIG_INET" = "y" ]; then +diff --unified --recursive --new-file linux-2.4.30/net/Config.in.ORG linux-2.4.30-1-686-smp-ring3/net/Config.in.ORG +--- linux-2.4.30/net/Config.in.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Config.in.ORG 2005-10-22 23:08:28.020050750 +0200 +@@ -0,0 +1,107 @@ ++# ++# Network configuration ++# ++mainmenu_option
next_comment ++comment 'Networking options' ++tristate 'Packet socket' CONFIG_PACKET ++if [ "$CONFIG_PACKET" != "n" ]; then ++ bool ' Packet socket: mmapped IO' CONFIG_PACKET_MMAP ++fi ++ ++tristate 'Netlink device emulation' CONFIG_NETLINK_DEV ++ ++bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER ++if [ "$CONFIG_NETFILTER" = "y" ]; then ++ bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG ++fi ++bool 'Socket Filtering' CONFIG_FILTER ++tristate 'Unix domain sockets' CONFIG_UNIX ++bool 'TCP/IP networking' CONFIG_INET ++if [ "$CONFIG_INET" = "y" ]; then ++ source net/ipv4/Config.in ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++# IPv6 as module will cause a CRASH if you try to unload it ++ tristate ' The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 ++ if [ "$CONFIG_IPV6" != "n" ]; then ++ source net/ipv6/Config.in ++ fi ++ fi ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ source net/khttpd/Config.in ++ fi ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ source net/sctp/Config.in ++ fi ++fi ++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ tristate 'Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)' CONFIG_ATM ++ if [ "$CONFIG_ATM" = "y" -o "$CONFIG_ATM" = "m" ]; then ++ if [ "$CONFIG_INET" = "y" ]; then ++ dep_tristate ' Classical IP over ATM' CONFIG_ATM_CLIP $CONFIG_ATM ++ if [ "$CONFIG_ATM_CLIP" != "n" ]; then ++ bool ' Do NOT send ICMP if no neighbour' CONFIG_ATM_CLIP_NO_ICMP ++ fi ++ fi ++ dep_tristate ' LAN Emulation (LANE) support' CONFIG_ATM_LANE $CONFIG_ATM ++ if [ "$CONFIG_INET" = "y" -a "$CONFIG_ATM_LANE" != "n" ]; then ++ tristate ' Multi-Protocol Over ATM (MPOA) support' CONFIG_ATM_MPOA ++ fi ++ dep_tristate ' RFC1483/2684 Bridged protocols' CONFIG_ATM_BR2684 $CONFIG_ATM ++ if [ "$CONFIG_ATM_BR2684" != "n" ]; then ++ bool ' Per-VC IP filter kludge' CONFIG_ATM_BR2684_IPFILTER ++ fi ++ fi ++fi ++tristate '802.1Q VLAN Support' CONFIG_VLAN_8021Q ++ ++comment ' ' ++tristate 'The IPX protocol' CONFIG_IPX ++if [ "$CONFIG_IPX" != "n" 
]; then ++ source net/ipx/Config.in ++fi ++ ++tristate 'Appletalk protocol support' CONFIG_ATALK ++if [ "$CONFIG_ATALK" != "n" ]; then ++ source drivers/net/appletalk/Config.in ++fi ++ ++tristate 'DECnet Support' CONFIG_DECNET ++if [ "$CONFIG_DECNET" != "n" ]; then ++ source net/decnet/Config.in ++fi ++dep_tristate '802.1d Ethernet Bridging' CONFIG_BRIDGE $CONFIG_INET ++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ tristate 'CCITT X.25 Packet Layer (EXPERIMENTAL)' CONFIG_X25 ++ tristate 'LAPB Data Link Driver (EXPERIMENTAL)' CONFIG_LAPB ++ bool '802.2 LLC (EXPERIMENTAL)' CONFIG_LLC ++ bool 'Frame Diverter (EXPERIMENTAL)' CONFIG_NET_DIVERT ++# if [ "$CONFIG_LLC" = "y" ]; then ++# bool ' Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI ++# fi ++ if [ "$CONFIG_INET" = "y" ]; then ++ tristate 'Acorn Econet/AUN protocols (EXPERIMENTAL)' CONFIG_ECONET ++ if [ "$CONFIG_ECONET" != "n" ]; then ++ bool ' AUN over UDP' CONFIG_ECONET_AUNUDP ++ bool ' Native Econet' CONFIG_ECONET_NATIVE ++ fi ++ fi ++ tristate 'WAN router' CONFIG_WAN_ROUTER ++ bool 'Fast switching (read help!)' CONFIG_NET_FASTROUTE ++ bool 'Forwarding between high speed interfaces' CONFIG_NET_HW_FLOWCONTROL ++fi ++ ++mainmenu_option next_comment ++comment 'QoS and/or fair queueing' ++bool 'QoS and/or fair queueing' CONFIG_NET_SCHED ++if [ "$CONFIG_NET_SCHED" = "y" ]; then ++ source net/sched/Config.in ++fi ++#bool 'Network code profiler' CONFIG_NET_PROFILE ++endmenu ++ ++mainmenu_option next_comment ++comment 'Network testing' ++dep_tristate 'Packet Generator (USE WITH CAUTION)' CONFIG_NET_PKTGEN $CONFIG_PROC_FS ++endmenu ++ ++endmenu +diff --unified --recursive --new-file linux-2.4.30/net/Makefile linux-2.4.30-1-686-smp-ring3/net/Makefile +--- linux-2.4.30/net/Makefile 2004-08-08 01:26:06.000000000 +0200 ++++ linux-2.4.30-1-686-smp-ring3/net/Makefile 2005-10-22 23:08:27.928045000 +0200 +@@ -7,7 +7,7 @@ + + O_TARGET := network.o + +-mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core 
sctp 802 ++mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp 802 ring + export-objs := netsyms.o + + subdir-y := core ethernet +@@ -46,6 +46,7 @@ + subdir-$(CONFIG_DECNET) += decnet + subdir-$(CONFIG_ECONET) += econet + subdir-$(CONFIG_VLAN_8021Q) += 8021q ++subdir-$(CONFIG_RING) += ring + + ifeq ($(CONFIG_NETFILTER),y) + mod-subdirs += ipv4/ipvs +diff --unified --recursive --new-file linux-2.4.30/net/Makefile.ORG linux-2.4.30-1-686-smp-ring3/net/Makefile.ORG +--- linux-2.4.30/net/Makefile.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Makefile.ORG 2005-10-22 23:08:27.916044250 +0200 +@@ -0,0 +1,61 @@ ++# ++# Makefile for the linux networking. ++# ++# 2 Sep 2000, Christoph Hellwig ++# Rewritten to use lists instead of if-statements. ++# ++ ++O_TARGET := network.o ++ ++mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp 802 ++export-objs := netsyms.o ++ ++subdir-y := core ethernet ++subdir-m := ipv4 # hum? 
++ ++ ++subdir-$(CONFIG_NET) += 802 sched netlink ++subdir-$(CONFIG_IPV6) += ipv6 ++subdir-$(CONFIG_INET) += ipv4 ++subdir-$(CONFIG_NETFILTER) += ipv4/netfilter ++subdir-$(CONFIG_UNIX) += unix ++subdir-$(CONFIG_IP_SCTP) += sctp ++ ++ifneq ($(CONFIG_IPV6),n) ++ifneq ($(CONFIG_IPV6),) ++subdir-$(CONFIG_NETFILTER) += ipv6/netfilter ++endif ++endif ++ ++subdir-$(CONFIG_KHTTPD) += khttpd ++subdir-$(CONFIG_PACKET) += packet ++subdir-$(CONFIG_NET_SCHED) += sched ++subdir-$(CONFIG_BRIDGE) += bridge ++subdir-$(CONFIG_IPX) += ipx ++subdir-$(CONFIG_ATALK) += appletalk ++subdir-$(CONFIG_WAN_ROUTER) += wanrouter ++subdir-$(CONFIG_X25) += x25 ++subdir-$(CONFIG_LAPB) += lapb ++subdir-$(CONFIG_NETROM) += netrom ++subdir-$(CONFIG_ROSE) += rose ++subdir-$(CONFIG_AX25) += ax25 ++subdir-$(CONFIG_IRDA) += irda ++subdir-$(CONFIG_BLUEZ) += bluetooth ++subdir-$(CONFIG_SUNRPC) += sunrpc ++subdir-$(CONFIG_ATM) += atm ++subdir-$(CONFIG_DECNET) += decnet ++subdir-$(CONFIG_ECONET) += econet ++subdir-$(CONFIG_VLAN_8021Q) += 8021q ++ ++ifeq ($(CONFIG_NETFILTER),y) ++ mod-subdirs += ipv4/ipvs ++ subdir-$(CONFIG_IP_VS) += ipv4/ipvs ++endif ++ ++obj-y := socket.o $(join $(subdir-y), $(patsubst %,/%.o,$(notdir $(subdir-y)))) ++ifeq ($(CONFIG_NET),y) ++obj-$(CONFIG_MODULES) += netsyms.o ++obj-$(CONFIG_SYSCTL) += sysctl_net.o ++endif ++ ++include $(TOPDIR)/Rules.make +diff --unified --recursive --new-file linux-2.4.30/net/core/dev.c linux-2.4.30-1-686-smp-ring3/net/core/dev.c +--- linux-2.4.30/net/core/dev.c 2005-04-04 03:42:20.000000000 +0200 ++++ linux-2.4.30-1-686-smp-ring3/net/core/dev.c 2005-10-22 23:08:27.900043250 +0200 +@@ -104,6 +104,56 @@ + #include /* Note : will define WIRELESS_EXT */ + #include + #endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ ++/* #define RING_DEBUG */ ++ ++#include ++#include ++ ++static handle_ring_skb ring_handler = NULL; /* skb hook; NULL while no handler is installed */ ++ ++handle_ring_skb get_skb_ring_handler(void) { return(ring_handler); } /* returns the installed skb hook, or NULL */ ++ 
++void set_skb_ring_handler(handle_ring_skb the_handler) { /* install the skb hook; NULL disables it */ ++ ring_handler = the_handler; ++} ++ ++void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb) { /* forwards to the skb hook when one is installed */ ++ if(ring_handler) ++ ring_handler(skb, recv_packet, real_skb); ++} ++ ++/* ******************* */ ++ ++static handle_ring_buffer buffer_ring_handler = NULL; /* raw-buffer hook; NULL while no handler is installed */ ++ ++handle_ring_buffer get_buffer_ring_handler(void) { return(buffer_ring_handler); } /* returns the installed buffer hook, or NULL */ ++ ++void set_buffer_ring_handler(handle_ring_buffer the_handler) { /* install the buffer hook; NULL disables it */ ++ buffer_ring_handler = the_handler; ++} ++ ++int do_buffer_ring_handler(struct net_device *dev, char *data, int len) { /* returns 1 if a hook was called, 0 if none is installed */ ++ if(buffer_ring_handler) { ++ buffer_ring_handler(dev, data, len); ++ return(1); ++ } else ++ return(0); ++} ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++EXPORT_SYMBOL(get_skb_ring_handler); ++EXPORT_SYMBOL(set_skb_ring_handler); ++EXPORT_SYMBOL(do_skb_ring_handler); ++ ++EXPORT_SYMBOL(get_buffer_ring_handler); ++EXPORT_SYMBOL(set_buffer_ring_handler); ++EXPORT_SYMBOL(do_buffer_ring_handler); ++#endif ++ ++#endif + #ifdef CONFIG_PLIP + extern int plip_init(void); + #endif +@@ -1066,6 +1116,10 @@ + return -ENOMEM; + } + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler) ring_handler(skb, 0, 1); ++#endif /* CONFIG_RING */ ++ + /* Grab device queue */ + spin_lock_bh(&dev->queue_lock); + q = dev->qdisc; +@@ -1278,6 +1332,13 @@ + struct softnet_data *queue; + unsigned long flags; + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } ++#endif /* CONFIG_RING */ ++ + if (skb->stamp.tv_sec == 0) + do_gettimeofday(&skb->stamp); + +@@ -1464,6 +1525,13 @@ + int ret = NET_RX_DROP; + unsigned short type; + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } 
++#endif /* CONFIG_RING */ ++ + if (skb->stamp.tv_sec == 0) + do_gettimeofday(&skb->stamp); + +diff --unified --recursive --new-file linux-2.4.30/net/core/dev.c.ORG linux-2.4.30-1-686-smp-ring3/net/core/dev.c.ORG +--- linux-2.4.30/net/core/dev.c.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/core/dev.c.ORG 2005-10-22 23:08:27.472016500 +0200 +@@ -0,0 +1,2926 @@ ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro, ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set to 2 ++ * if register_netdev gets called before ++ * net_dev_init & also removed a few lines ++ * of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant stunts to ++ * keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before calling netif_rx. Saves ++ * a function call a packet. 
++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close changes. ++ * Rudi Cilibrasi : Pass the right thing to set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to make it work out ++ * on a Sparc. ++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_NET_RADIO) || defined(CONFIG_NET_PCMCIA_RADIO) ++#include /* Note : will define WIRELESS_EXT */ ++#include ++#endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ ++#ifdef CONFIG_PLIP ++extern int plip_init(void); ++#endif ++ ++ ++/* This define, if set, will randomly drop a packet when congestion ++ * is more than moderate. 
It helps fairness in the multi-interface ++ * case when one of them is a hog, but it kills performance for the ++ * single interface case so it is off now by default. ++ */ ++#undef RAND_LIE ++ ++/* Setting this will sample the queue lengths and thus congestion ++ * via a timer instead of as each packet is received. ++ */ ++#undef OFFLINE_SAMPLE ++ ++NET_PROFILE_DEFINE(dev_queue_xmit) ++NET_PROFILE_DEFINE(softnet_process) ++ ++const char *if_port_text[] = { ++ "unknown", ++ "BNC", ++ "10baseT", ++ "AUI", ++ "100baseT", ++ "100baseTX", ++ "100baseFX" ++}; ++ ++/* ++ * The list of packet types we will receive (as opposed to discard) ++ * and the routines to invoke. ++ * ++ * Why 16. Because with 16 the only overlap we get on a hash of the ++ * low nibble of the protocol value is RARP/SNAP/X.25. ++ * ++ * NOTE: That is no longer true with the addition of VLAN tags. Not ++ * sure which should go first, but I bet it won't make much ++ * difference if we are running VLANs. The good news is that ++ * this protocol won't be in the list unless compiled in, so ++ * the average user (w/out VLANs) will not be adversly affected. ++ * --BLG ++ * ++ * 0800 IP ++ * 8100 802.1Q VLAN ++ * 0001 802.3 ++ * 0002 AX.25 ++ * 0004 802.2 ++ * 8035 RARP ++ * 0005 SNAP ++ * 0805 X.25 ++ * 0806 ARP ++ * 8137 IPX ++ * 0009 Localtalk ++ * 86DD IPv6 ++ */ ++ ++static struct packet_type *ptype_base[16]; /* 16 way hashed list */ ++static struct packet_type *ptype_all = NULL; /* Taps */ ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy); ++static struct timer_list samp_timer = { function: sample_queue }; ++#endif ++ ++#ifdef CONFIG_HOTPLUG ++static int net_run_sbin_hotplug(struct net_device *dev, char *action); ++#else ++#define net_run_sbin_hotplug(dev, action) ({ 0; }) ++#endif ++ ++/* ++ * Our notifier list ++ */ ++ ++static struct notifier_block *netdev_chain=NULL; ++ ++/* ++ * Device drivers call our routines to queue packets here. 
We empty the ++ * queue in the local softnet handler. ++ */ ++struct softnet_data softnet_data[NR_CPUS] __cacheline_aligned; ++ ++#ifdef CONFIG_NET_FASTROUTE ++int netdev_fastroute; ++int netdev_fastroute_obstacles; ++#endif ++ ++ ++/****************************************************************************************** ++ ++ Protocol management and registration routines ++ ++*******************************************************************************************/ ++ ++/* ++ * For efficiency ++ */ ++ ++int netdev_nit=0; /* count of registered ETH_P_ALL handlers; adjusted by dev_add_pack()/dev_remove_pack() */ ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. 
++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ int hash; ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ ++#ifdef CONFIG_NET_FASTROUTE ++ /* Hack to detect packet socket */ ++ if ((pt->data) && ((int)(pt->data)!=1)) { ++ netdev_fastroute_obstacles++; ++ dev_clear_fastroute(pt->dev); ++ } ++#endif ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit++; ++ pt->next=ptype_all; ++ ptype_all=pt; ++ } else { ++ hash=ntohs(pt->type)&15; ++ pt->next = ptype_base[hash]; ++ ptype_base[hash] = pt; ++ } ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++} ++ ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ */ ++ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ struct packet_type **pt1; ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit--; ++ pt1=&ptype_all; ++ } else { ++ pt1=&ptype_base[ntohs(pt->type)&15]; ++ } ++ ++ for (; (*pt1) != NULL; pt1 = &((*pt1)->next)) { ++ if (pt == (*pt1)) { ++ *pt1 = pt->next; ++#ifdef CONFIG_NET_FASTROUTE ++ if (pt->data) ++ netdev_fastroute_obstacles--; ++#endif ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ return; ++ } ++ } ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); ++} ++ ++/****************************************************************************** ++ ++ Device Boot-time Settings Routines ++ ++*******************************************************************************/ ++ ++/* Boot time configuration table */ ++static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; ++ ++/** ++ * netdev_boot_setup_add - add new setup entry ++ * @name: name of the device ++ * @map: configured settings for the device ++ * ++ * Adds new setup entry 
to the dev_boot_setup list. The function ++ * returns 0 on error and 1 on success. This is a generic routine to ++ * all netdevices. ++ */ ++int netdev_boot_setup_add(char *name, struct ifmap *map) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { ++ memset(s[i].name, 0, sizeof(s[i].name)); ++ strcpy(s[i].name, name); ++ memcpy(&s[i].map, map, sizeof(s[i].map)); ++ break; ++ } ++ } ++ ++ if (i >= NETDEV_BOOT_SETUP_MAX) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * netdev_boot_setup_check - check boot time settings ++ * @dev: the netdevice ++ * ++ * Check boot time settings for the device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found, 1 if they are. ++ */ ++int netdev_boot_setup_check(struct net_device *dev) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && ++ !strncmp(dev->name, s[i].name, strlen(s[i].name))) { ++ dev->irq = s[i].map.irq; ++ dev->base_addr = s[i].map.base_addr; ++ dev->mem_start = s[i].map.mem_start; ++ dev->mem_end = s[i].map.mem_end; ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++/* ++ * Saves at boot time configured settings for any netdevice. 
++ */ ++int __init netdev_boot_setup(char *str) ++{ ++ int ints[5]; ++ struct ifmap map; ++ ++ str = get_options(str, ARRAY_SIZE(ints), ints); ++ if (!str || !*str) ++ return 0; ++ ++ /* Save settings */ ++ memset(&map, 0, sizeof(map)); ++ if (ints[0] > 0) ++ map.irq = ints[1]; ++ if (ints[0] > 1) ++ map.base_addr = ints[2]; ++ if (ints[0] > 2) ++ map.mem_start = ints[3]; ++ if (ints[0] > 3) ++ map.mem_end = ints[4]; ++ ++ /* Add new entry to the list */ ++ return netdev_boot_setup_add(str, &map); ++} ++ ++__setup("netdev=", netdev_boot_setup); ++ ++/***************************************************************************************** ++ ++ Device Interface Subroutines ++ ++******************************************************************************************/ ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++ ++struct net_device *__dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (strncmp(dev->name, name, IFNAMSIZ) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. 
++ */ ++ ++struct net_device *dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/* ++ Return value is changed to int to prevent illegal usage in future. ++ It is still legal to use to check for device existence. ++ ++ User should understand, that the result returned by this function ++ is meaningless, if it was not issued under rtnl semaphore. ++ */ ++ ++/** ++ * dev_get - test if a device exists ++ * @name: name to test for ++ * ++ * Test if a name exists. Returns true if the name is found. In order ++ * to be sure the name is not allocated or removed during the test the ++ * caller must hold the rtnl semaphore. ++ * ++ * This function primarily exists for back compatibility with older ++ * drivers. ++ */ ++ ++int dev_get(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ read_unlock(&dev_base_lock); ++ return dev != NULL; ++} ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device * __dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (dev->ifindex == ifindex) ++ return dev; ++ } ++ return NULL; ++} ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. 
The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. ++ */ ++ ++struct net_device * dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifindex); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_getbyhwaddr - find a device by its hardware address ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. The caller must hold the ++ * rtnl semaphore. The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ * BUGS: ++ * If the API was consistent this would be __dev_get_by_hwaddr ++ */ ++ ++struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (dev->type == type && ++ memcmp(dev->dev_addr, ha, dev->addr_len) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_flags(if_flags, mask); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The caller must hold either ++ * the RTNL semaphore or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (((dev->flags ^ if_flags) & mask) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. Not efficient for many devices, not called a lot. The caller ++ * must hold the dev_base or rtnl lock while allocating the name and ++ * adding the device in order to avoid duplicates. Returns the number ++ * of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ int i; ++ char buf[32]; ++ char *p; ++ ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters, or no "%" characters at all. ++ */ ++ p = strchr(name, '%'); ++ if (p && (p[1] != 'd' || strchr(p+2, '%'))) ++ return -EINVAL; ++ ++ /* ++ * If you need over 100 please also fix the algorithm... 
++ */ ++ for (i = 0; i < 100; i++) { ++ snprintf(buf,sizeof(buf),name,i); ++ if (__dev_get_by_name(buf) == NULL) { ++ strcpy(dev->name, buf); ++ return i; ++ } ++ } ++ return -ENFILE; /* Over 100 of the things .. bail out! */ ++} ++ ++/** ++ * dev_alloc - allocate a network device and name ++ * @name: name format string ++ * @err: error return pointer ++ * ++ * Passed a format string, eg. "lt%d", it will allocate a network device ++ * and space for the name. %NULL is returned if no memory is available. ++ * If the allocation succeeds then the name is assigned and the ++ * device pointer returned. %NULL is returned if the name allocation ++ * failed. The cause of an error is returned as a negative errno code ++ * in the variable @err points to. ++ * ++ * The caller must hold the @dev_base or RTNL locks when doing this in ++ * order to avoid duplicate name allocations. ++ */ ++ ++struct net_device *dev_alloc(const char *name, int *err) ++{ ++ struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL); ++ if (dev == NULL) { ++ *err = -ENOBUFS; ++ return NULL; ++ } ++ memset(dev, 0, sizeof(struct net_device)); ++ *err = dev_alloc_name(dev, name); ++ if (*err < 0) { ++ kfree(dev); ++ return NULL; ++ } ++ return dev; ++} ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags&IFF_UP) { ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0); ++ } ++} ++ ++ ++#ifdef CONFIG_KMOD ++ ++/** ++ * dev_load - load a network module ++ * @name: name of interface ++ * ++ * If a network interface is not present and the process has suitable ++ * privileges this function loads the module. 
If module loading is not ++ * available in this kernel then it becomes a nop. ++ */ ++ ++void dev_load(const char *name) ++{ ++ if (!dev_get(name) && capable(CAP_SYS_MODULE)) ++ request_module(name); ++} ++ ++#else ++ ++extern inline void dev_load(const char *unused){;} ++ ++#endif ++ ++static int default_rebuild_header(struct sk_buff *skb) ++{ ++ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); ++ kfree_skb(skb); ++ return 1; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++ ++int dev_open(struct net_device *dev) ++{ ++ int ret = 0; ++ ++ /* ++ * Is it already up? ++ */ ++ ++ if (dev->flags&IFF_UP) ++ return 0; ++ ++ /* ++ * Is it even present? ++ */ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ /* ++ * Call device private open method ++ */ ++ if (try_inc_mod_count(dev->owner)) { ++ set_bit(__LINK_STATE_START, &dev->state); ++ if (dev->open) { ++ ret = dev->open(dev); ++ if (ret != 0) { ++ clear_bit(__LINK_STATE_START, &dev->state); ++ if (dev->owner) ++ __MOD_DEC_USE_COUNT(dev->owner); ++ } ++ } ++ } else { ++ ret = -ENODEV; ++ } ++ ++ /* ++ * If it went open OK then: ++ */ ++ ++ if (ret == 0) ++ { ++ /* ++ * Set the flags. ++ */ ++ dev->flags |= IFF_UP; ++ ++ /* ++ * Initialize multicasting status ++ */ ++ dev_mc_upload(dev); ++ ++ /* ++ * Wakeup transmit queue engine ++ */ ++ dev_activate(dev); ++ ++ /* ++ * ... and announce new interface. 
++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UP, dev); ++ } ++ return(ret); ++} ++ ++#ifdef CONFIG_NET_FASTROUTE ++ ++static void dev_do_clear_fastroute(struct net_device *dev) ++{ ++ if (dev->accept_fastpath) { ++ int i; ++ ++ for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) { ++ struct dst_entry *dst; ++ ++ write_lock_irq(&dev->fastpath_lock); ++ dst = dev->fastpath[i]; ++ dev->fastpath[i] = NULL; ++ write_unlock_irq(&dev->fastpath_lock); ++ ++ dst_release(dst); ++ } ++ } ++} ++ ++void dev_clear_fastroute(struct net_device *dev) ++{ ++ if (dev) { ++ dev_do_clear_fastroute(dev); ++ } else { ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev; dev = dev->next) ++ dev_do_clear_fastroute(dev); ++ read_unlock(&dev_base_lock); ++ } ++} ++#endif ++ ++/** ++ * dev_close - shutdown an interface. ++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++ ++int dev_close(struct net_device *dev) ++{ ++ if (!(dev->flags&IFF_UP)) ++ return 0; ++ ++ /* ++ * Tell people we are going down, so that they can ++ * prepare to death, when device is still operating. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); ++ ++ dev_deactivate(dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, ++ * it can be even on different cpu. So just clear netif_running(), ++ * and wait when poll really will happen. Actually, the best place ++ * for this is inside dev->stop() after device stopped its irq ++ * engine, but this requires more changes in devices. */ ++ ++ smp_mb__after_clear_bit(); /* Commit netif_running(). */ ++ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { ++ /* No hurry. 
*/ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(1); ++ } ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ ++ if (dev->stop) ++ dev->stop(dev); ++ ++ /* ++ * Device is now down. ++ */ ++ ++ dev->flags &= ~IFF_UP; ++#ifdef CONFIG_NET_FASTROUTE ++ dev_clear_fastroute(dev); ++#endif ++ ++ /* ++ * Tell people we are down ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); ++ ++ /* ++ * Drop the module refcount ++ */ ++ if (dev->owner) ++ __MOD_DEC_USE_COUNT(dev->owner); ++ ++ return(0); ++} ++ ++ ++/* ++ * Device change register/unregister. These are not inline or static ++ * as we export them to the world. ++ */ ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_register(&netdev_chain, nb); ++} ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_unregister(&netdev_chain,nb); ++} ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. 
++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ do_gettimeofday(&skb->stamp); ++ ++ br_read_lock(BR_NETPROTO_LOCK); ++ for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) ++ { ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if ((ptype->dev == dev || !ptype->dev) && ++ ((struct sock *)ptype->data != skb->sk)) ++ { ++ struct sk_buff *skb2; ++ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) ++ break; /* clone failed: give up on the remaining taps */ ++ ++ /* skb->nh should already be set correctly ++ by the sender, so the range check below is ++ just protection against buggy protocols. ++ */ ++ skb2->mac.raw = skb2->data; ++ ++ if (skb2->nh.raw < skb2->data || skb2->nh.raw > skb2->tail) { ++ if (net_ratelimit()) ++ printk(KERN_CRIT "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); ++ skb2->nh.raw = skb2->data; ++ } ++ ++ skb2->h.raw = skb2->nh.raw; ++ skb2->pkt_type = PACKET_OUTGOING; ++ ptype->func(skb2, skb->dev, ptype); ++ } ++ } ++ br_read_unlock(BR_NETPROTO_LOCK); ++} ++ ++/* Calculate the checksum in the case where a packet is misrouted. ++ * If that fails for some reason, ignore it and send the skb with a wrong ++ * checksum. ++ */ ++struct sk_buff * skb_checksum_help(struct sk_buff *skb) ++{ ++ int offset; ++ unsigned int csum; ++ ++ offset = skb->h.raw - skb->data; ++ if (offset > (int)skb->len) ++ BUG(); ++ csum = skb_checksum(skb, offset, skb->len-offset, 0); ++ ++ offset = skb->tail - skb->h.raw; ++ if (offset <= 0) ++ BUG(); ++ if (skb->csum+2 > offset) /* the 16-bit checksum field must lie before skb->tail */ ++ BUG(); ++ ++ *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); /* skb->csum is used as the field's offset from skb->h.raw */ ++ skb->ip_summed = CHECKSUM_NONE; ++ return skb; ++} ++ ++#ifdef CONFIG_HIGHMEM ++/* Actually, we should eliminate this check as soon as we know, that: ++ * 1. IOMMU is present and allows to map all the memory. ++ * 2. No high memory really exists on this machine.
++ */ ++ ++static inline int ++illegal_highdma(struct net_device *dev, struct sk_buff *skb) /* 1 if @skb has a highmem fragment and @dev lacks NETIF_F_HIGHDMA, else 0 */ ++{ ++ int i; ++ ++ if (dev->features&NETIF_F_HIGHDMA) ++ return 0; /* device can DMA from high memory */ ++ ++ for (i=0; i<skb_shinfo(skb)->nr_frags; i++) ++ if (skb_shinfo(skb)->frags[i].page >= highmem_start_page) ++ return 1; /* this fragment's page is in high memory */ ++ ++ return 0; ++} ++#else ++#define illegal_highdma(dev, skb) (0) ++#endif ++ ++/** ++ * dev_queue_xmit - transmit a buffer ++ * @skb: buffer to transmit ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling this ++ * function. The function can be called from an interrupt. ++ * ++ * A negative errno code is returned on a failure. A success does not ++ * guarantee the frame will be transmitted as it may be dropped due ++ * to congestion or traffic shaping. ++ */ ++ ++int dev_queue_xmit(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct Qdisc *q; ++ ++ if (skb_shinfo(skb)->frag_list && ++ !(dev->features&NETIF_F_FRAGLIST) && ++ skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return -ENOMEM; ++ } ++ ++ /* Fragmented skb is linearized if device does not support SG, ++ * or if at least one of fragments is in highmem and device ++ * does not support DMA from it. ++ */ ++ if (skb_shinfo(skb)->nr_frags && ++ (!(dev->features&NETIF_F_SG) || illegal_highdma(dev, skb)) && ++ skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return -ENOMEM; ++ } ++ ++ /* If packet is not checksummed and device does not support ++ * checksumming for this protocol, complete checksumming here.
++ */ ++ if (skb->ip_summed == CHECKSUM_HW && ++ (!(dev->features&(NETIF_F_HW_CSUM|NETIF_F_NO_CSUM)) && ++ (!(dev->features&NETIF_F_IP_CSUM) || ++ skb->protocol != htons(ETH_P_IP)))) { ++ if ((skb = skb_checksum_help(skb)) == NULL) ++ return -ENOMEM; ++ } ++ ++ /* Grab device queue */ ++ spin_lock_bh(&dev->queue_lock); ++ q = dev->qdisc; ++ if (q->enqueue) { ++ int ret = q->enqueue(skb, q); ++ ++ qdisc_run(dev); ++ ++ spin_unlock_bh(&dev->queue_lock); ++ return ret == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : ret; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ loopback, all the sorts of tunnels... ++ ++ Really, it is unlikely that xmit_lock protection is necessary here. ++ (f.e. loopback and IP tunnels are clean ignoring statistics counters.) ++ However, it is possible, that they rely on protection ++ made by us here. ++ ++ Check this and shot the lock. It is not prone from deadlocks. ++ Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags&IFF_UP) { ++ int cpu = smp_processor_id(); ++ ++ if (dev->xmit_lock_owner != cpu) { ++ spin_unlock(&dev->queue_lock); ++ spin_lock(&dev->xmit_lock); ++ dev->xmit_lock_owner = cpu; ++ ++ if (!netif_queue_stopped(dev)) { ++ if (netdev_nit) ++ dev_queue_xmit_nit(skb,dev); ++ ++ if (dev->hard_start_xmit(skb, dev) == 0) { ++ dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&dev->xmit_lock); ++ return 0; ++ } ++ } ++ dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&dev->xmit_lock); ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Virtual device %s asks to queue packet!\n", dev->name); ++ kfree_skb(skb); ++ return -ENETDOWN; ++ } else { ++ /* Recursion is detected! 
It is possible, unfortunately */ ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Dead loop on virtual device %s, fix it urgently!\n", dev->name); ++ } ++ } ++ spin_unlock_bh(&dev->queue_lock); ++ ++ kfree_skb(skb); ++ return -ENETDOWN; ++} ++ ++ ++/*======================================================================= ++ Receiver routines ++ =======================================================================*/ ++ ++int netdev_max_backlog = 300; ++int weight_p = 64; /* old backlog weight */ ++/* These numbers are selected based on intuition and some ++ * experimentation; if you have a more scientific way of doing this ++ * please go ahead and fix things. ++ */ ++int no_cong_thresh = 10; ++int no_cong = 20; ++int lo_cong = 100; ++int mod_cong = 290; ++ ++struct netif_rx_stats netdev_rx_stat[NR_CPUS]; ++ ++ ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++atomic_t netdev_dropping = ATOMIC_INIT(0); ++static unsigned long netdev_fc_mask = 1; /* bit 0 starts out set; netdev_unregister_fc() ignores bit 0 */ ++unsigned long netdev_fc_xoff = 0; ++spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED; ++ ++static struct ++{ ++ void (*stimul)(struct net_device *); ++ struct net_device *dev; ++} netdev_fc_slots[BITS_PER_LONG]; ++ ++int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev)) ++{ ++ int bit = 0; /* stays 0 (and is returned as such) when the mask is full */ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netdev_fc_lock, flags); ++ if (netdev_fc_mask != ~0UL) { ++ bit = ffz(netdev_fc_mask); ++ netdev_fc_slots[bit].stimul = stimul; ++ netdev_fc_slots[bit].dev = dev; ++ set_bit(bit, &netdev_fc_mask); ++ clear_bit(bit, &netdev_fc_xoff); ++ } ++ spin_unlock_irqrestore(&netdev_fc_lock, flags); ++ return bit; ++} ++ ++void netdev_unregister_fc(int bit) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netdev_fc_lock, flags); ++ if (bit > 0) { /* 0 is what netdev_register_fc() returns when it found no slot */ ++ netdev_fc_slots[bit].stimul = NULL; ++ netdev_fc_slots[bit].dev = NULL; ++ clear_bit(bit, &netdev_fc_mask); ++ clear_bit(bit, &netdev_fc_xoff); ++ } ++ spin_unlock_irqrestore(&netdev_fc_lock, flags); ++} ++ ++static void netdev_wakeup(void)
++{ ++ unsigned long xoff; ++ ++ spin_lock(&netdev_fc_lock); ++ xoff = netdev_fc_xoff; ++ netdev_fc_xoff = 0; ++ while (xoff) { ++ int i = ffz(~xoff); ++ xoff &= ~(1<> 1)+ (blog >> 1); ++ ++ if (avg_blog > mod_cong) { ++ /* Above moderate congestion levels. */ ++ softnet_data[cpu].cng_level = NET_RX_CN_HIGH; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ softnet_data[cpu].cng_level = NET_RX_DROP; ++#endif ++ } else if (avg_blog > lo_cong) { ++ softnet_data[cpu].cng_level = NET_RX_CN_MOD; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ softnet_data[cpu].cng_level = NET_RX_CN_HIGH; ++#endif ++ } else if (avg_blog > no_cong) ++ softnet_data[cpu].cng_level = NET_RX_CN_LOW; ++ else /* no congestion */ ++ softnet_data[cpu].cng_level = NET_RX_SUCCESS; ++ ++ softnet_data[cpu].avg_blog = avg_blog; ++} ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy) ++{ ++/* 10 ms 0r 1ms -- i dont care -- JHS */ ++ int next_tick = 1; ++ int cpu = smp_processor_id(); ++ ++ get_sample_stats(cpu); ++ next_tick += jiffies; ++ mod_timer(&samp_timer, next_tick); ++} ++#endif ++ ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process. It always succeeds. The buffer ++ * may be dropped during processing for congestion control or by the ++ * protocol layers. 
++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_CN_LOW (low congestion) ++ * NET_RX_CN_MOD (moderate congestion) ++ * NET_RX_CN_HIGH (high congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ * ++ */ ++ ++int netif_rx(struct sk_buff *skb) ++{ ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue; ++ unsigned long flags; ++ ++ if (skb->stamp.tv_sec == 0) ++ do_gettimeofday(&skb->stamp); ++ ++ /* The code is arranged so that the path is ++ shortest when the CPU is congested but still operating. ++ */ ++ queue = &softnet_data[this_cpu]; ++ ++ local_irq_save(flags); ++ ++ netdev_rx_stat[this_cpu].total++; ++ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { ++ if (queue->input_pkt_queue.qlen) { ++ if (queue->throttle) ++ goto drop; /* throttled and the backlog has not emptied yet */ ++ ++enqueue: ++ dev_hold(skb->dev); ++ __skb_queue_tail(&queue->input_pkt_queue,skb); ++ local_irq_restore(flags); ++#ifndef OFFLINE_SAMPLE ++ get_sample_stats(this_cpu); ++#endif ++ return queue->cng_level; ++ } ++ ++ if (queue->throttle) { /* backlog is empty again: stop throttling */ ++ queue->throttle = 0; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if (atomic_dec_and_test(&netdev_dropping)) ++ netdev_wakeup(); ++#endif ++ } ++ ++ netif_rx_schedule(&queue->blog_dev); ++ goto enqueue; ++ } ++ ++ if (queue->throttle == 0) { /* backlog exceeded netdev_max_backlog: start throttling */ ++ queue->throttle = 1; ++ netdev_rx_stat[this_cpu].throttled++; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ atomic_inc(&netdev_dropping); ++#endif ++ } ++ ++drop: ++ netdev_rx_stat[this_cpu].dropped++; ++ local_irq_restore(flags); ++ ++ kfree_skb(skb); ++ return NET_RX_DROP; ++} ++ ++/* Deliver skb to an old protocol, which is not threaded well ++ or which do not understand shared skbs.
++ */ ++static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last) ++{ ++ static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED; ++ int ret = NET_RX_DROP; ++ ++ ++ if (!last) { /* not the last delivery: hand over a clone, leave the caller's skb alone */ ++ skb = skb_clone(skb, GFP_ATOMIC); ++ if (skb == NULL) ++ return ret; ++ } ++ if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return ret; ++ } ++ ++ /* The assumption (correct one) is that old protocols ++ did not depend on BHs other than NET_BH and TIMER_BH. ++ */ ++ ++ /* Emulate NET_BH with special spinlock */ ++ spin_lock(&net_bh_lock); ++ ++ /* Disable timers and wait for all timers completion */ ++ tasklet_disable(bh_task_vec+TIMER_BH); ++ ++ ret = pt->func(skb, skb->dev, pt); ++ ++ tasklet_hi_enable(bh_task_vec+TIMER_BH); ++ spin_unlock(&net_bh_lock); ++ return ret; ++} ++ ++static __inline__ void skb_bond(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ ++ if (dev->master) { /* device has a master: remember the real device, then retarget the skb to the master */ ++ skb->real_dev = skb->dev; ++ skb->dev = dev->master; ++ } ++} ++ ++static void net_tx_action(struct softirq_action *h) ++{ ++ int cpu = smp_processor_id(); ++ ++ if (softnet_data[cpu].completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = softnet_data[cpu].completion_queue; ++ softnet_data[cpu].completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist != NULL) { ++ struct sk_buff *skb = clist; ++ clist = clist->next; ++ ++ BUG_TRAP(atomic_read(&skb->users) == 0); ++ __kfree_skb(skb); ++ } ++ } ++ ++ if (softnet_data[cpu].output_queue) { ++ struct net_device *head; ++ ++ local_irq_disable(); ++ head = softnet_data[cpu].output_queue; ++ softnet_data[cpu].output_queue = NULL; ++ local_irq_enable(); ++ ++ while (head != NULL) { ++ struct net_device *dev = head; ++ head = head->next_sched; ++ ++ smp_mb__before_clear_bit(); ++ clear_bit(__LINK_STATE_SCHED, &dev->state); ++ ++ if (spin_trylock(&dev->queue_lock)) { ++ qdisc_run(dev); ++ spin_unlock(&dev->queue_lock); ++ } else { ++
netif_schedule(dev); ++ } ++ } ++ } ++} ++ ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL; ++#endif ++ ++static __inline__ int handle_bridge(struct sk_buff *skb, ++ struct packet_type *pt_prev) ++{ ++ int ret = NET_RX_DROP; ++ ++ if (pt_prev) { ++ if (!pt_prev->data) ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ ++ br_handle_frame_hook(skb); ++ return ret; ++} ++ ++ ++#ifdef CONFIG_NET_DIVERT ++static inline int handle_diverter(struct sk_buff *skb) ++{ ++ /* if diversion is supported on device, then divert */ ++ if (skb->dev->divert && skb->dev->divert->divert) ++ divert_frame(skb); ++ return 0; ++} ++#endif /* CONFIG_NET_DIVERT */ ++ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ struct packet_type *ptype, *pt_prev; ++ int ret = NET_RX_DROP; ++ unsigned short type; ++ ++ if (skb->stamp.tv_sec == 0) ++ do_gettimeofday(&skb->stamp); ++ ++ skb_bond(skb); ++ ++ netdev_rx_stat[smp_processor_id()].total++; ++ ++#ifdef CONFIG_NET_FASTROUTE ++ if (skb->pkt_type == PACKET_FASTROUTE) { ++ netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++; ++ return dev_queue_xmit(skb); ++ } ++#endif ++ ++ skb->h.raw = skb->nh.raw = skb->data; ++ ++ pt_prev = NULL; ++ for (ptype = ptype_all; ptype; ptype = ptype->next) { ++ if (!ptype->dev || ptype->dev == skb->dev) { ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ } else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ pt_prev = ptype; ++ } ++ } ++ ++#ifdef CONFIG_NET_DIVERT ++ if (skb->dev->divert && skb->dev->divert->divert) ++ ret = handle_diverter(skb); ++#endif /* CONFIG_NET_DIVERT */ ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++ if (skb->dev->br_port != NULL && br_handle_frame_hook != NULL && ++ skb->pkt_type != PACKET_LOOPBACK) { ++ 
return handle_bridge(skb, pt_prev); ++ } ++#endif ++ ++ type = skb->protocol; ++ for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) { ++ if (ptype->type == type && ++ (!ptype->dev || ptype->dev == skb->dev)) { ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ } else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ pt_prev = ptype; ++ } ++ } ++ ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 1); ++ } else { ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } else { ++ kfree_skb(skb); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++ return ret; ++} ++ ++static int process_backlog(struct net_device *backlog_dev, int *budget) ++{ ++ int work = 0; ++ int quota = min(backlog_dev->quota, *budget); ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue = &softnet_data[this_cpu]; ++ unsigned long start_time = jiffies; ++ ++ for (;;) { ++ struct sk_buff *skb; ++ struct net_device *dev; ++ ++ local_irq_disable(); ++ skb = __skb_dequeue(&queue->input_pkt_queue); ++ if (skb == NULL) ++ goto job_done; ++ local_irq_enable(); ++ ++ dev = skb->dev; ++ ++ netif_receive_skb(skb); ++ ++ dev_put(dev); ++ ++ work++; ++ ++ if (work >= quota || jiffies - start_time > 1) ++ break; ++ ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) { ++ queue->throttle = 0; ++ if (atomic_dec_and_test(&netdev_dropping)) { ++ netdev_wakeup(); ++ break; ++ } ++ } ++#endif ++ } ++ ++ backlog_dev->quota -= work; ++ *budget -= work; ++ return -1; ++ ++job_done: ++ backlog_dev->quota -= work; ++ *budget -= work; ++ ++ list_del(&backlog_dev->poll_list); ++ smp_mb__before_clear_bit(); ++ netif_poll_enable(backlog_dev); ++ ++ if (queue->throttle) { ++ queue->throttle = 0; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if 
(atomic_dec_and_test(&netdev_dropping)) ++ netdev_wakeup(); ++#endif ++ } ++ local_irq_enable(); ++ return 0; ++} ++ ++static void net_rx_action(struct softirq_action *h) ++{ ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue = &softnet_data[this_cpu]; ++ unsigned long start_time = jiffies; ++ int budget = netdev_max_backlog; ++ ++ br_read_lock(BR_NETPROTO_LOCK); ++ local_irq_disable(); ++ ++ while (!list_empty(&queue->poll_list)) { ++ struct net_device *dev; ++ ++ if (budget <= 0 || jiffies - start_time > 1) ++ goto softnet_break; ++ ++ local_irq_enable(); ++ ++ dev = list_entry(queue->poll_list.next, struct net_device, poll_list); ++ ++ if (dev->quota <= 0 || dev->poll(dev, &budget)) { ++ local_irq_disable(); ++ list_del(&dev->poll_list); ++ list_add_tail(&dev->poll_list, &queue->poll_list); ++ if (dev->quota < 0) ++ dev->quota += dev->weight; ++ else ++ dev->quota = dev->weight; ++ } else { ++ dev_put(dev); ++ local_irq_disable(); ++ } ++ } ++ ++ local_irq_enable(); ++ br_read_unlock(BR_NETPROTO_LOCK); ++ return; ++ ++softnet_break: ++ netdev_rx_stat[this_cpu].time_squeeze++; ++ __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); ++ ++ local_irq_enable(); ++ br_read_unlock(BR_NETPROTO_LOCK); ++} ++ ++static gifconf_func_t * gifconf_list [NPROTO]; ++ ++/** ++ * register_gifconf - register a SIOCGIF handler ++ * @family: Address family ++ * @gifconf: Function handler ++ * ++ * Register protocol dependent address dumping routines. The handler ++ * that is passed must not be freed or reused until it has been replaced ++ * by another handler. ++ */ ++ ++int register_gifconf(unsigned int family, gifconf_func_t * gifconf) ++{ ++ if (family>=NPROTO) ++ return -EINVAL; ++ gifconf_list[family] = gifconf; ++ return 0; ++} ++ ++ ++/* ++ * Map an interface index to its name (SIOCGIFNAME) ++ */ ++ ++/* ++ * We need this ioctl for efficient implementation of the ++ * if_indextoname() function required by the IPv6 API. 
Without ++ * it, we would have to search all the interfaces to find a ++ * match. --pb ++ */ ++ ++static int dev_ifname(struct ifreq *arg) ++{ ++ struct net_device *dev; ++ struct ifreq ifr; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifr.ifr_ifindex); ++ if (!dev) { ++ read_unlock(&dev_base_lock); ++ return -ENODEV; ++ } ++ ++ strcpy(ifr.ifr_name, dev->name); ++ read_unlock(&dev_base_lock); ++ ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return 0; ++} ++ ++/* ++ * Perform a SIOCGIFCONF call. This structure will change ++ * size eventually, and there is nothing I can do about it. ++ * Thus we will need a 'compatibility mode'. ++ */ ++ ++static int dev_ifconf(char *arg) ++{ ++ struct ifconf ifc; ++ struct net_device *dev; ++ char *pos; ++ int len; ++ int total; ++ int i; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) ++ return -EFAULT; ++ ++ pos = ifc.ifc_buf; ++ len = ifc.ifc_len; ++ ++ /* ++ * Loop over the interfaces, and write an info block for each. ++ */ ++ ++ total = 0; ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ for (i=0; iget_stats ? 
dev->get_stats(dev): NULL); ++ int size; ++ ++ if (stats) ++ size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", ++ dev->name, ++ stats->rx_bytes, ++ stats->rx_packets, stats->rx_errors, ++ stats->rx_dropped + stats->rx_missed_errors, ++ stats->rx_fifo_errors, ++ stats->rx_length_errors + stats->rx_over_errors ++ + stats->rx_crc_errors + stats->rx_frame_errors, ++ stats->rx_compressed, stats->multicast, ++ stats->tx_bytes, ++ stats->tx_packets, stats->tx_errors, stats->tx_dropped, ++ stats->tx_fifo_errors, stats->collisions, ++ stats->tx_carrier_errors + stats->tx_aborted_errors ++ + stats->tx_window_errors + stats->tx_heartbeat_errors, ++ stats->tx_compressed); ++ else ++ size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); ++ ++ return size; ++} ++ ++/* ++ * Called from the PROCfs module. This now uses the new arbitrary sized /proc/net interface ++ * to create /proc/net/dev ++ */ ++ ++static int dev_get_info(char *buffer, char **start, off_t offset, int length) ++{ ++ int len = 0; ++ off_t begin = 0; ++ off_t pos = 0; ++ int size; ++ struct net_device *dev; ++ ++ ++ size = sprintf(buffer, ++ "Inter-| Receive | Transmit\n" ++ " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); ++ ++ pos += size; ++ len += size; ++ ++ ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ size = sprintf_stats(buffer+len, dev); ++ len += size; ++ pos = begin + len; ++ ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset + length) ++ break; ++ } ++ read_unlock(&dev_base_lock); ++ ++ *start = buffer + (offset - begin); /* Start of wanted data */ ++ len -= (offset - begin); /* Start slop */ ++ if (len > length) ++ len = length; /* Ending slop */ ++ if (len < 0) ++ len = 0; ++ return len; ++} ++ ++static int dev_proc_stats(char *buffer, char **start, off_t offset, ++ int length, int 
*eof, void *data) ++{ ++ int i, lcpu; ++ int len=0; ++ ++ for (lcpu=0; lcpu length) ++ len = length; ++ if (len < 0) ++ len = 0; ++ ++ *start = buffer + offset; ++ *eof = 1; ++ ++ return len; ++} ++ ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/** ++ * netdev_set_master - set up master/slave pair ++ * @slave: slave device ++ * @master: new master device ++ * ++ * Changes the master device of the slave. Pass %NULL to break the ++ * bonding. The caller must hold the RTNL semaphore. On a failure ++ * a negative errno code is returned (-EBUSY if @slave already has a master). On success the reference counts ++ * are adjusted, %RTM_NEWLINK is sent to the routing socket and the ++ * function returns zero. ++ */ ++ ++int netdev_set_master(struct net_device *slave, struct net_device *master) ++{ ++ struct net_device *old = slave->master; ++ ++ ASSERT_RTNL(); ++ ++ if (master) { ++ if (old) ++ return -EBUSY; ++ dev_hold(master); /* take a reference on the new master */ ++ } ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ slave->master = master; ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ ++ if (old) ++ dev_put(old); /* only reached with master == NULL: drop the old master's reference */ ++ ++ if (master) ++ slave->flags |= IFF_SLAVE; ++ else ++ slave->flags &= ~IFF_SLAVE; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative @inc ++ * value is used to drop promiscuity on the device.
++ */ ++ ++void dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_PROMISC; /* set first; cleared again just below if the count reaches zero */ ++ if ((dev->promiscuity += inc) == 0) ++ dev->flags &= ~IFF_PROMISC; ++ if (dev->flags^old_flags) { ++#ifdef CONFIG_NET_FASTROUTE ++ if (dev->flags&IFF_PROMISC) { ++ netdev_fastroute_obstacles++; ++ dev_clear_fastroute(dev); ++ } else ++ netdev_fastroute_obstacles--; ++#endif ++ dev_mc_upload(dev); ++ printk(KERN_INFO "device %s %s promiscuous mode\n", ++ dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); ++ } ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Enable or disable reception of all multicast frames on a device. While the ++ * count in the device remains above zero the interface keeps receiving ++ * all multicast frames. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ */ ++ ++void dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_ALLMULTI; /* set first; cleared again just below if the count reaches zero */ ++ if ((dev->allmulti += inc) == 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ if (dev->flags^old_flags) ++ dev_mc_upload(dev); ++} ++ ++int dev_change_flags(struct net_device *dev, unsigned flags) ++{ ++ int ret; ++ int old_flags = dev->flags; ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC| ++ IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ dev_mc_upload(dev); ++ ++ /* ++ * Have we downed the interface. We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it.
++ */ ++ ++ ret = 0; ++ if ((old_flags^flags)&IFF_UP) /* Bit is different ? */ ++ { ++ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); ++ ++ if (ret == 0) ++ dev_mc_upload(dev); ++ } ++ ++ if (dev->flags&IFF_UP && ++ ((old_flags^dev->flags)&~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE))) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ ++ if ((flags^dev->gflags)&IFF_PROMISC) { ++ int inc = (flags&IFF_PROMISC) ? +1 : -1; ++ dev->gflags ^= IFF_PROMISC; ++ dev_set_promiscuity(dev, inc); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ is important. Some (broken) drivers set IFF_PROMISC, when ++ IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags^dev->gflags)&IFF_ALLMULTI) { ++ int inc = (flags&IFF_ALLMULTI) ? +1 : -1; ++ dev->gflags ^= IFF_ALLMULTI; ++ dev_set_allmulti(dev, inc); ++ } ++ ++ if (old_flags^dev->flags) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags); ++ ++ return ret; ++} ++ ++/* ++ * Perform the SIOCxIFxxx calls. 
++ */ ++ ++static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ++{ ++ struct net_device *dev; ++ int err; ++ ++ if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) ++ return -ENODEV; ++ ++ switch(cmd) ++ { ++ case SIOCGIFFLAGS: /* Get interface flags */ ++ ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING)) ++ |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI)); ++ if (netif_running(dev) && netif_carrier_ok(dev)) ++ ifr->ifr_flags |= IFF_RUNNING; ++ return 0; ++ ++ case SIOCSIFFLAGS: /* Set interface flags */ ++ return dev_change_flags(dev, ifr->ifr_flags); ++ ++ case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ++ ifr->ifr_metric = 0; ++ return 0; ++ ++ case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ ++ return -EOPNOTSUPP; ++ ++ case SIOCGIFMTU: /* Get the MTU of a device */ ++ ifr->ifr_mtu = dev->mtu; ++ return 0; ++ ++ case SIOCSIFMTU: /* Set the MTU of a device */ ++ if (ifr->ifr_mtu == dev->mtu) ++ return 0; ++ ++ /* ++ * MTU must be positive. 
++ */ ++ ++ if (ifr->ifr_mtu<0) ++ return -EINVAL; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ if (dev->change_mtu) ++ err = dev->change_mtu(dev, ifr->ifr_mtu); ++ else { ++ dev->mtu = ifr->ifr_mtu; ++ err = 0; ++ } ++ if (!err && dev->flags&IFF_UP) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); ++ return err; ++ ++ case SIOCGIFHWADDR: ++ memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); ++ ifr->ifr_hwaddr.sa_family=dev->type; ++ return 0; ++ ++ case SIOCSIFHWADDR: ++ if (dev->set_mac_address == NULL) ++ return -EOPNOTSUPP; ++ if (ifr->ifr_hwaddr.sa_family!=dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev->set_mac_address(dev, &ifr->ifr_hwaddr); ++ if (!err) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return err; ++ ++ case SIOCSIFHWBROADCAST: ++ if (ifr->ifr_hwaddr.sa_family!=dev->type) ++ return -EINVAL; ++ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return 0; ++ ++ case SIOCGIFMAP: ++ ifr->ifr_map.mem_start=dev->mem_start; ++ ifr->ifr_map.mem_end=dev->mem_end; ++ ifr->ifr_map.base_addr=dev->base_addr; ++ ifr->ifr_map.irq=dev->irq; ++ ifr->ifr_map.dma=dev->dma; ++ ifr->ifr_map.port=dev->if_port; ++ return 0; ++ ++ case SIOCSIFMAP: ++ if (dev->set_config) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->set_config(dev,&ifr->ifr_map); ++ } ++ return -EOPNOTSUPP; ++ ++ case SIOCADDMULTI: ++ if (dev->set_multicast_list == NULL || ++ ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); ++ return 0; ++ ++ case SIOCDELMULTI: ++ if (dev->set_multicast_list == NULL || ++ ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ 
dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); ++ return 0; ++ ++ case SIOCGIFINDEX: ++ ifr->ifr_ifindex = dev->ifindex; ++ return 0; ++ ++ case SIOCGIFTXQLEN: ++ ifr->ifr_qlen = dev->tx_queue_len; ++ return 0; ++ ++ case SIOCSIFTXQLEN: ++ if (ifr->ifr_qlen<0) ++ return -EINVAL; ++ dev->tx_queue_len = ifr->ifr_qlen; ++ return 0; ++ ++ case SIOCSIFNAME: ++ if (dev->flags&IFF_UP) ++ return -EBUSY; ++ /* Check if name contains a wildcard */ ++ if (strchr(ifr->ifr_newname, '%')) { ++ char format[IFNAMSIZ + 1]; ++ int ret; ++ memcpy(format, ifr->ifr_newname, IFNAMSIZ); ++ format[IFNAMSIZ-1] = 0; ++ /* Find a free name based on format. ++ * dev_alloc_name() replaces "%d" with at max ++ * 2 digits, so no name overflow. - Jean II */ ++ ret = dev_alloc_name(dev, format); ++ if (ret < 0) ++ return ret; ++ /* Copy the new name back to caller. */ ++ strncpy(ifr->ifr_newname, dev->name, IFNAMSIZ); ++ } else { ++ if (__dev_get_by_name(ifr->ifr_newname)) ++ return -EEXIST; ++ memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); ++ dev->name[IFNAMSIZ-1] = 0; ++ } ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); ++ return 0; ++ ++ /* ++ * Unknown or private ioctl ++ */ ++ ++ default: ++ if ((cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15) || ++ cmd == SIOCBONDENSLAVE || ++ cmd == SIOCBONDRELEASE || ++ cmd == SIOCBONDSETHWADDR || ++ cmd == SIOCBONDSLAVEINFOQUERY || ++ cmd == SIOCBONDINFOQUERY || ++ cmd == SIOCBONDCHANGEACTIVE || ++ cmd == SIOCGMIIPHY || ++ cmd == SIOCGMIIREG || ++ cmd == SIOCSMIIREG || ++ cmd == SIOCWANDEV) { ++ if (dev->do_ioctl) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->do_ioctl(dev, ifr, cmd); ++ } ++ return -EOPNOTSUPP; ++ } ++ ++ } ++ return -EINVAL; ++} ++ ++/* ++ * This function handles all "interface"-type I/O control requests. The actual ++ * 'doing' part of this is dev_ifsioc above. 
++ */
++
++/**
++ * dev_ioctl - network device ioctl
++ * @cmd: command to issue
++ * @arg: pointer to a struct ifreq in user space
++ *
++ * Issue ioctl functions to devices. This is normally called by the
++ * user space syscall interfaces but can sometimes be useful for
++ * other purposes. The return value is the return from the syscall if
++ * positive or a negative errno code on error.
++ *
++ * SIOCGIFCONF and SIOCGIFNAME are handed to dev_ifconf() and
++ * dev_ifname() with the raw user pointer. For every other command the
++ * ifreq is copied in from @arg, its name is forcibly NUL-terminated and
++ * cut at the first ':' before the command is dispatched. -EFAULT is
++ * returned when a copy from/to user space fails, -EPERM when a
++ * privileged command is issued without CAP_NET_ADMIN, and -EINVAL for
++ * commands that match none of the handled cases.
++ */
++
++int dev_ioctl(unsigned int cmd, void *arg)
++{
++	struct ifreq ifr;
++	int ret;
++	char *colon;
++
++	/* One special case: SIOCGIFCONF takes ifconf argument
++	   and requires shared lock, because it sleeps writing
++	   to user space.
++	 */
++
++	if (cmd == SIOCGIFCONF) {
++		rtnl_shlock();
++		ret = dev_ifconf((char *) arg);
++		rtnl_shunlock();
++		return ret;
++	}
++	if (cmd == SIOCGIFNAME) {
++		return dev_ifname((struct ifreq *)arg);
++	}
++
++	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
++		return -EFAULT;
++
++	ifr.ifr_name[IFNAMSIZ-1] = 0;	/* force termination of the user-supplied name */
++
++	colon = strchr(ifr.ifr_name, ':');
++	if (colon)
++		*colon = 0;	/* the helpers below only see the part before ':' */
++
++	/*
++	 * See which interface the caller is talking about.
++	 */
++
++	switch(cmd)
++	{
++		/*
++		 * These ioctl calls:
++		 * - can be done by all.
++		 * - atomic and do not require locking.
++		 * - return a value
++		 */
++
++		case SIOCGIFFLAGS:
++		case SIOCGIFMETRIC:
++		case SIOCGIFMTU:
++		case SIOCGIFHWADDR:
++		case SIOCGIFSLAVE:
++		case SIOCGIFMAP:
++		case SIOCGIFINDEX:
++		case SIOCGIFTXQLEN:
++			dev_load(ifr.ifr_name);
++			read_lock(&dev_base_lock);
++			ret = dev_ifsioc(&ifr, cmd);
++			read_unlock(&dev_base_lock);
++			if (!ret) {
++				if (colon)
++					*colon = ':';	/* give the caller back the full name it passed in */
++				if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
++					return -EFAULT;
++			}
++			return ret;
++
++		case SIOCETHTOOL:
++			dev_load(ifr.ifr_name);
++			rtnl_lock();
++			ret = dev_ethtool(&ifr);
++			rtnl_unlock();
++			if (!ret) {
++				if (colon)
++					*colon = ':';
++				if (copy_to_user(arg, &ifr,
++						 sizeof(struct ifreq)))
++					ret = -EFAULT;
++			}
++			return ret;
++
++		/*
++		 * These ioctl calls:
++		 * - require superuser power.
++		 * - require strict serialization.
++		 * - return a value
++		 */
++
++		case SIOCSIFNAME:
++		case SIOCGMIIPHY:
++		case SIOCGMIIREG:
++			if (!capable(CAP_NET_ADMIN))
++				return -EPERM;
++			dev_load(ifr.ifr_name);
++			dev_probe_lock();	/* taken before rtnl_lock() and released after it */
++			rtnl_lock();
++			ret = dev_ifsioc(&ifr, cmd);
++			rtnl_unlock();
++			dev_probe_unlock();
++			if (!ret) {
++				if (colon)
++					*colon = ':';
++				if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
++					return -EFAULT;
++			}
++			return ret;
++
++		/*
++		 * These ioctl calls:
++		 * - require superuser power.
++		 * - require strict serialization.
++		 * - do not return a value
++		 */
++
++		case SIOCSIFFLAGS:
++		case SIOCSIFMETRIC:
++		case SIOCSIFMTU:
++		case SIOCSIFMAP:
++		case SIOCSIFHWADDR:
++		case SIOCSIFSLAVE:
++		case SIOCADDMULTI:
++		case SIOCDELMULTI:
++		case SIOCSIFHWBROADCAST:
++		case SIOCSIFTXQLEN:
++		case SIOCSMIIREG:
++		case SIOCBONDENSLAVE:
++		case SIOCBONDRELEASE:
++		case SIOCBONDSETHWADDR:
++		case SIOCBONDSLAVEINFOQUERY:
++		case SIOCBONDINFOQUERY:
++		case SIOCBONDCHANGEACTIVE:
++			if (!capable(CAP_NET_ADMIN))
++				return -EPERM;
++			dev_load(ifr.ifr_name);
++			dev_probe_lock();
++			rtnl_lock();
++			ret = dev_ifsioc(&ifr, cmd);
++			rtnl_unlock();
++			dev_probe_unlock();
++			return ret;	/* nothing is copied back to user space here */
++
++		case SIOCGIFMEM:
++			/* Get the per device memory space. We can add this but currently
++			   do not support it */
++		case SIOCSIFMEM:
++			/* Set the per device memory buffer space. Not applicable in our case */
++		case SIOCSIFLINK:
++			return -EINVAL;
++
++		/*
++		 * Unknown or private ioctl.
++		 */
++
++		default:
++			if (cmd == SIOCWANDEV ||
++			    (cmd >= SIOCDEVPRIVATE &&
++			     cmd <= SIOCDEVPRIVATE + 15)) {
++				dev_load(ifr.ifr_name);
++				dev_probe_lock();
++				rtnl_lock();
++				ret = dev_ifsioc(&ifr, cmd);
++				rtnl_unlock();
++				dev_probe_unlock();
++				if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))	/* NOTE(review): unlike the cases above, the ':' is not restored before this copy-back */
++					return -EFAULT;
++				return ret;
++			}
++#ifdef WIRELESS_EXT
++			/* Take care of Wireless Extensions */
++			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
++				/* If command is `set a parameter', or
++				 * `get the encoding parameters', check if
++				 * the user has the right to do it */
++				if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
++					if(!capable(CAP_NET_ADMIN))
++						return -EPERM;
++				}
++				dev_load(ifr.ifr_name);
++				rtnl_lock();
++				/* Follow me in net/core/wireless.c */
++				ret = wireless_process_ioctl(&ifr, cmd);
++				rtnl_unlock();
++				if (!ret && IW_IS_GET(cmd) &&
++				    copy_to_user(arg, &ifr, sizeof(struct ifreq)))
++					return -EFAULT;
++				return ret;
++			}
++#endif	/* WIRELESS_EXT */
++			return -EINVAL;
++	}
++}
++
++
++/**
++ * dev_new_index -
allocate an ifindex
++ *
++ * Advances a function-local counter until it lands on a value for
++ * which __dev_get_by_index() finds no device, and returns that value.
++ * The caller must hold the rtnl semaphore or the dev_base_lock to be
++ * sure the result remains unique.
++ */
++
++int dev_new_index(void)
++{
++	static int ifindex;	/* last value handed out; persists across calls */
++
++	do {
++		ifindex++;
++		if (ifindex <= 0)	/* never hand out a non-positive index */
++			ifindex = 1;
++	} while (__dev_get_by_index(ifindex) != NULL);
++
++	return ifindex;
++}
++
++static int dev_boot_phase = 1;
++
++/**
++ * register_netdevice - register a network device
++ * @dev: device to register
++ *
++ * Take a completed network device structure and add it to the kernel
++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
++ * chain. 0 is returned on success. A negative errno code is returned
++ * on a failure to set up the device, or if the name is a duplicate.
++ *
++ * Callers must hold the rtnl semaphore. See the comment at the
++ * end of Space.c for details about the locking. You may want
++ * register_netdev() instead of this.
++ *
++ * BUGS:
++ * The locking appears insufficient to guarantee two parallel registers
++ * will not get the same name.
++ */
++
++int net_dev_init(void);
++
++int register_netdevice(struct net_device *dev)
++{
++	struct net_device *d, **dp;
++#ifdef CONFIG_NET_DIVERT
++	int ret;
++#endif
++
++	spin_lock_init(&dev->queue_lock);
++	spin_lock_init(&dev->xmit_lock);
++	dev->xmit_lock_owner = -1;
++#ifdef CONFIG_NET_FASTROUTE
++	dev->fastpath_lock=RW_LOCK_UNLOCKED;
++#endif
++
++	if (dev_boot_phase)
++		net_dev_init();	/* returns at once when dev_boot_phase is already 0 */
++
++#ifdef CONFIG_NET_DIVERT
++	ret = alloc_divert_blk(dev);
++	if (ret)
++		return ret;
++#endif /* CONFIG_NET_DIVERT */
++
++	dev->iflink = -1;	/* still -1 after init() means "default to own ifindex" below */
++
++	/* Init, if this function is available */
++	if (dev->init && dev->init(dev) != 0) {
++#ifdef CONFIG_NET_DIVERT
++		free_divert_blk(dev);
++#endif
++		return -EIO;
++	}
++
++	dev->ifindex = dev_new_index();
++	if (dev->iflink == -1)
++		dev->iflink = dev->ifindex;
++
++	/* Check for existence, and append to tail of chain */
++	for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
++		if (d == dev || strcmp(d->name, dev->name) == 0) {
++#ifdef CONFIG_NET_DIVERT
++			free_divert_blk(dev);
++#endif
++			return -EEXIST;
++		}
++	}
++
++	/* Fix illegal SG+CSUM combinations. */
++	if ((dev->features & NETIF_F_SG) &&
++	    !(dev->features & (NETIF_F_IP_CSUM |
++			       NETIF_F_NO_CSUM |
++			       NETIF_F_HW_CSUM))) {
++		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
++		       dev->name);
++		dev->features &= ~NETIF_F_SG;
++	}
++
++	/*
++	 * nil rebuild_header routine,
++	 * that should be never called and used as just bug trap.
++	 */
++
++	if (dev->rebuild_header == NULL)
++		dev->rebuild_header = default_rebuild_header;
++
++	/*
++	 * Default initial state at registry is that the
++	 * device is present.
++	 */
++
++	set_bit(__LINK_STATE_PRESENT, &dev->state);
++
++	dev->next = NULL;
++	dev_init_scheduler(dev);
++	write_lock_bh(&dev_base_lock);
++	*dp = dev;	/* dp still addresses the tail link found by the duplicate scan */
++	dev_hold(dev);
++	dev->deadbeaf = 0;
++	write_unlock_bh(&dev_base_lock);
++
++	/* Notify protocols, that a new device appeared. */
++	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
++
++	net_run_sbin_hotplug(dev, "register");
++
++	return 0;
++}
++
++/**
++ * netdev_finish_unregister - complete unregistration
++ * @dev: device
++ *
++ * Destroy and free a dead device. A value of zero is returned on
++ * success.
++ *
++ * A device whose deadbeaf flag is clear is only reported, not freed,
++ * and zero is returned in that case as well. The structure itself is
++ * kfree()d only when %NETIF_F_DYNALLOC is set in dev->features.
++ */
++
++int netdev_finish_unregister(struct net_device *dev)
++{
++	BUG_TRAP(dev->ip_ptr==NULL);
++	BUG_TRAP(dev->ip6_ptr==NULL);
++	BUG_TRAP(dev->dn_ptr==NULL);
++
++	if (!dev->deadbeaf) {
++		printk(KERN_ERR "Freeing alive device %p, %s\n", dev, dev->name);
++		return 0;
++	}
++#ifdef NET_REFCNT_DEBUG
++	printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
++	       (dev->features & NETIF_F_DYNALLOC)?"":", old style");
++#endif
++	if (dev->destructor)
++		dev->destructor(dev);
++	if (dev->features & NETIF_F_DYNALLOC)
++		kfree(dev);
++	return 0;
++}
++
++/**
++ * unregister_netdevice - remove device from the kernel
++ * @dev: device
++ *
++ * This function shuts down a device interface and removes it
++ * from the kernel tables. On success 0 is returned, on a failure
++ * a negative errno code is returned.
++ *
++ * Callers must hold the rtnl semaphore. See the comment at the
++ * end of Space.c for details about the locking. You may want
++ * unregister_netdev() instead of this.
++ *
++ * The only failure returned here is -ENODEV, when @dev is not found
++ * on the dev_base chain. For devices without %NETIF_F_DYNALLOC the
++ * function sleeps in a loop until dev->refcnt has dropped to 1.
++ */
++
++int unregister_netdevice(struct net_device *dev)
++{
++	unsigned long now, warning_time;
++	struct net_device *d, **dp;
++
++	/* If device is running, close it first. */
++	if (dev->flags & IFF_UP)
++		dev_close(dev);
++
++	BUG_TRAP(dev->deadbeaf==0);
++	dev->deadbeaf = 1;	/* the flag netdev_finish_unregister() insists on before freeing */
++
++	/* And unlink it from device chain. */
++	for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
++		if (d == dev) {
++			write_lock_bh(&dev_base_lock);
++			*dp = d->next;
++			write_unlock_bh(&dev_base_lock);
++			break;
++		}
++	}
++	if (d == NULL) {
++		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never was registered\n", dev->name, dev);
++		return -ENODEV;	/* NOTE(review): deadbeaf stays set on this path */
++	}
++
++	/* Synchronize to net_rx_action. */
++	br_write_lock_bh(BR_NETPROTO_LOCK);
++	br_write_unlock_bh(BR_NETPROTO_LOCK);
++
++	if (dev_boot_phase == 0) {
++#ifdef CONFIG_NET_FASTROUTE
++		dev_clear_fastroute(dev);
++#endif
++
++		/* Shutdown queueing discipline. */
++		dev_shutdown(dev);
++
++		net_run_sbin_hotplug(dev, "unregister");
++
++		/* Notify protocols, that we are about to destroy
++		   this device. They should clean all the things.
++		 */
++		notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
++
++		/*
++		 * Flush the multicast chain
++		 */
++		dev_mc_discard(dev);
++	}
++
++	if (dev->uninit)
++		dev->uninit(dev);
++
++	/* Notifier chain MUST detach us from master device. */
++	BUG_TRAP(dev->master==NULL);
++
++#ifdef CONFIG_NET_DIVERT
++	free_divert_blk(dev);
++#endif
++
++	if (dev->features & NETIF_F_DYNALLOC) {
++#ifdef NET_REFCNT_DEBUG
++		if (atomic_read(&dev->refcnt) != 1)
++			printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt)-1);
++#endif
++		dev_put(dev);
++		return 0;
++	}
++
++	/* Last reference is our one */
++	if (atomic_read(&dev->refcnt) == 1) {
++		dev_put(dev);
++		return 0;
++	}
++
++#ifdef NET_REFCNT_DEBUG
++	printk("unregister_netdevice: waiting %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt));
++#endif
++
++	/* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
++	   it means that someone in the kernel still has a reference
++	   to this device and we cannot release it.
++
++	   "New style" devices have destructors, hence we can return from this
++	   function and destructor will do all the work later. As of kernel 2.4.0
++	   there are very few "New Style" devices.
++
++	   "Old style" devices expect that the device is free of any references
++	   upon exit from this function.
++	   We cannot return from this function until all such references have
++	   fallen away. This is because the caller of this function will probably
++	   immediately kfree(*dev) and then be unloaded via sys_delete_module.
++
++	   So, we linger until all references fall away. The duration of the
++	   linger is basically unbounded! It is driven by, for example, the
++	   current setting of sysctl_ipfrag_time.
++
++	   After 1 second, we start to rebroadcast unregister notifications
++	   in hope that careless clients will release the device.
++
++	 */
++
++	now = warning_time = jiffies;
++	while (atomic_read(&dev->refcnt) != 1) {
++		if ((jiffies - now) > 1*HZ) {
++			/* Rebroadcast unregister notification */
++			notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
++		}
++		current->state = TASK_INTERRUPTIBLE;
++		schedule_timeout(HZ/4);	/* re-check the count every HZ/4 jiffies */
++		current->state = TASK_RUNNING;
++		if ((jiffies - warning_time) > 10*HZ) {
++			printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
++			       "become free. Usage count = %d\n",
++			       dev->name, atomic_read(&dev->refcnt));
++			warning_time = jiffies;	/* so the warning repeats at most every 10*HZ */
++		}
++	}
++	dev_put(dev);
++	return 0;
++}
++
++
++/*
++ * Initialize the DEV module. At boot time this walks the device list and
++ * unhooks any devices that fail to initialise (normally hardware not
++ * present) and leaves us with a valid list of present and active devices.
++ *
++ */
++
++extern void net_device_init(void);
++extern void ip_auto_config(void);
++struct proc_dir_entry *proc_net_drivers;
++#ifdef CONFIG_NET_DIVERT
++extern void dv_init(void);
++#endif /* CONFIG_NET_DIVERT */
++
++
++/*
++ * Callers must hold the rtnl semaphore. See the comment at the
++ * end of Space.c for details about the locking.
++ *
++ * Does its work once: returns 0 immediately when dev_boot_phase is
++ * already 0, and clears dev_boot_phase itself before returning 0.
++ */
++int __init net_dev_init(void)
++{
++	struct net_device *dev, **dp;
++	int i;
++
++	if (!dev_boot_phase)
++		return 0;
++
++
++#ifdef CONFIG_NET_DIVERT
++	dv_init();
++#endif /* CONFIG_NET_DIVERT */
++
++	/*
++	 * Initialise the packet receive queues.
++	 */
++
++	for (i = 0; i < NR_CPUS; i++) {
++		struct softnet_data *queue;
++
++		queue = &softnet_data[i];
++		skb_queue_head_init(&queue->input_pkt_queue);
++		queue->throttle = 0;
++		queue->cng_level = 0;
++		queue->avg_blog = 10; /* arbitrary non-zero */
++		queue->completion_queue = NULL;
++		INIT_LIST_HEAD(&queue->poll_list);
++		set_bit(__LINK_STATE_START, &queue->blog_dev.state);
++		queue->blog_dev.weight = weight_p;
++		queue->blog_dev.poll = process_backlog;
++		atomic_set(&queue->blog_dev.refcnt, 1);
++	}
++
++#ifdef CONFIG_NET_PROFILE
++	net_profile_init();
++	NET_PROFILE_REGISTER(dev_queue_xmit);
++	NET_PROFILE_REGISTER(softnet_process);
++#endif
++
++#ifdef OFFLINE_SAMPLE
++	samp_timer.expires = jiffies + (10 * HZ);
++	add_timer(&samp_timer);
++#endif
++
++	/*
++	 * Add the devices.
++	 * If the call to dev->init fails, the dev is removed
++	 * from the chain disconnecting the device until the
++	 * next reboot.
++	 *
++	 * NB At boot phase networking is dead. No locking is required.
++	 * But we still preserve dev_base_lock for sanity.
++	 */
++
++	dp = &dev_base;
++	while ((dev = *dp) != NULL) {
++		spin_lock_init(&dev->queue_lock);
++		spin_lock_init(&dev->xmit_lock);
++#ifdef CONFIG_NET_FASTROUTE
++		dev->fastpath_lock = RW_LOCK_UNLOCKED;
++#endif
++		dev->xmit_lock_owner = -1;
++		dev->iflink = -1;
++		dev_hold(dev);	/* dropped again by the dev_put() in the unhook pass if init fails */
++
++		/*
++		 * Allocate name. If the init() fails
++		 * the name will be reissued correctly.
++		 */
++		if (strchr(dev->name, '%'))
++			dev_alloc_name(dev, dev->name);
++
++		/*
++		 * Check boot time settings for the device.
++		 */
++		netdev_boot_setup_check(dev);
++
++		if (dev->init && dev->init(dev)) {
++			/*
++			 * It failed to come up. It will be unhooked later.
++			 * dev_alloc_name can now advance to next suitable
++			 * name that is checked next.
++			 */
++			dev->deadbeaf = 1;
++			dp = &dev->next;
++		} else {
++			dp = &dev->next;
++			dev->ifindex = dev_new_index();
++			if (dev->iflink == -1)
++				dev->iflink = dev->ifindex;
++			if (dev->rebuild_header == NULL)
++				dev->rebuild_header = default_rebuild_header;
++			dev_init_scheduler(dev);
++			set_bit(__LINK_STATE_PRESENT, &dev->state);
++		}
++	}
++
++	/*
++	 * Unhook devices that failed to come up
++	 */
++	dp = &dev_base;
++	while ((dev = *dp) != NULL) {
++		if (dev->deadbeaf) {
++			write_lock_bh(&dev_base_lock);
++			*dp = dev->next;	/* dp is not advanced: it now addresses the successor */
++			write_unlock_bh(&dev_base_lock);
++			dev_put(dev);
++		} else {
++			dp = &dev->next;
++		}
++	}
++
++#ifdef CONFIG_PROC_FS
++	proc_net_create("dev", 0, dev_get_info);
++	create_proc_read_entry("net/softnet_stat", 0, 0, dev_proc_stats, NULL);
++	proc_net_drivers = proc_mkdir("net/drivers", 0);
++#ifdef WIRELESS_EXT
++	/* Available in net/core/wireless.c */
++	proc_net_create("wireless", 0, dev_get_wireless_info);
++#endif	/* WIRELESS_EXT */
++#endif	/* CONFIG_PROC_FS */
++
++	dev_boot_phase = 0;
++
++	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
++	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
++
++	dst_init();
++	dev_mcast_init();
++
++#ifdef CONFIG_NET_SCHED
++	pktsched_init();
++#endif
++	/*
++	 * Initialise network devices
++	 */
++
++	net_device_init();
++
++	return 0;
++}
++
++#ifdef CONFIG_HOTPLUG
++
++/* Notify userspace when a netdevice event occurs,
++ * by running '/sbin/hotplug net' with certain
++ * environment variables set.
++ */ ++ ++static int net_run_sbin_hotplug(struct net_device *dev, char *action) ++{ ++ char *argv[3], *envp[5], ifname[12 + IFNAMSIZ], action_str[32]; ++ int i; ++ ++ sprintf(ifname, "INTERFACE=%s", dev->name); ++ sprintf(action_str, "ACTION=%s", action); ++ ++ i = 0; ++ argv[i++] = hotplug_path; ++ argv[i++] = "net"; ++ argv[i] = 0; ++ ++ i = 0; ++ /* minimal command environment */ ++ envp [i++] = "HOME=/"; ++ envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; ++ envp [i++] = ifname; ++ envp [i++] = action_str; ++ envp [i] = 0; ++ ++ return call_usermodehelper(argv [0], argv, envp); ++} ++#endif +diff --unified --recursive --new-file linux-2.4.30/net/netsyms.c linux-2.4.30-1-686-smp-ring3/net/netsyms.c +--- linux-2.4.30/net/netsyms.c 2005-04-04 03:42:20.000000000 +0200 ++++ linux-2.4.30-1-686-smp-ring3/net/netsyms.c 2005-10-22 23:08:28.016050500 +0200 +@@ -628,3 +628,18 @@ + #endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ + + #endif /* CONFIG_NET */ ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++#include ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++#include ++ ++EXPORT_SYMBOL(get_skb_ring_handler); ++EXPORT_SYMBOL(set_skb_ring_handler); ++EXPORT_SYMBOL(do_skb_ring_handler); ++EXPORT_SYMBOL(get_buffer_ring_handler); ++EXPORT_SYMBOL(set_buffer_ring_handler); ++EXPORT_SYMBOL(do_buffer_ring_handler); ++#endif ++ ++#endif +diff --unified --recursive --new-file linux-2.4.30/net/ring/Config.in linux-2.4.30-1-686-smp-ring3/net/ring/Config.in +--- linux-2.4.30/net/ring/Config.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/ring/Config.in 2005-10-22 23:08:28.048052500 +0200 +@@ -0,0 +1,4 @@ ++# ++# PF_RING ++# ++tristate ' PF_RING (EXPERIMENTAL)' CONFIG_RING +diff --unified --recursive --new-file linux-2.4.30/net/ring/Makefile linux-2.4.30-1-686-smp-ring3/net/ring/Makefile +--- linux-2.4.30/net/ring/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/ring/Makefile 2005-10-22 
23:08:27.420013250 +0200
+@@ -0,0 +1,16 @@
++#
++# Makefile for the ring driver.
++#
++
++O_TARGET := ring.o
++
++export-objs := ring_packet.o
++
++obj-y := ring_packet.o
++
++ifeq ($(CONFIG_RING),m)
++ obj-m += $(O_TARGET)
++endif
++
++include $(TOPDIR)/Rules.make
++
+diff --unified --recursive --new-file linux-2.4.30/net/ring/ring_packet.c linux-2.4.30-1-686-smp-ring3/net/ring/ring_packet.c
+--- linux-2.4.30/net/ring/ring_packet.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-2.4.30-1-686-smp-ring3/net/ring/ring_packet.c 2005-10-22 23:08:27.440014500 +0200
+@@ -0,0 +1,1623 @@
++/*
++ *
++ * (C) 2004-05 - Luca Deri
++ *
++ * This code includes patches courtesy of
++ * - Jeff Randall
++ * - Helmut Manck
++ * - Brad Doctor
++ *
++ */
++
++/* FIX: add an entry inside the /proc filesystem */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++#include
++#else
++#include
++#endif
++#include
++#include /* needed for virt_to_phys() */
++
++/* #define RING_DEBUG */
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)) /* compat: provide remap_page_range() on top of remap_pfn_range() */
++static inline int remap_page_range(struct vm_area_struct *vma,
++				   unsigned long uvaddr,
++				   unsigned long paddr, /* physical address; converted to a page frame number below */
++				   unsigned long size,
++				   pgprot_t prot) {
++  return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT,
++			 size, prot));
++}
++#endif
++
++/* ************************************************* */
++
++#define CLUSTER_LEN 8 /* capacity of ring_cluster.sk[] */
++
++struct ring_cluster { /* one node of the singly linked ring_cluster_list */
++  u_short cluster_id; /* 0 = no cluster */
++  u_short num_cluster_elements; /* modulus applied to the hash in hash_skb() */
++  enum cluster_type hashing_mode; /* cluster_round_robin, otherwise per-flow hashing (see hash_skb()) */
++  u_short hashing_id; /* round-robin counter, post-incremented by hash_skb() */
++  struct sock *sk[CLUSTER_LEN]; /* member sockets, indexed by the value hash_skb() returns */
++  struct ring_cluster *next; /* NULL = last element of the cluster */
++};
++
++/* ************************************************* */
++
++struct ring_element { /* one node of ring_table, allocated by ring_insert() */
++  struct list_head list; /* linkage into ring_table */
++  struct sock *sk; /* the ring socket this node stands for */
++};
++
++/*
************************************************* */
++
++struct ring_opt { /* per-socket ring state, reached through ring_sk(sk) */
++  struct net_device *ring_netdev; /* skb_ring_handler() only feeds packets whose skb->dev equals this */
++
++  /* Cluster */
++  u_short cluster_id; /* 0 = no cluster */
++
++  /* Reflector */
++  struct net_device *reflector_dev; /* if set and its queue is not stopped, add_skb_to_ring() transmits here instead of storing */
++
++  /* Packet buffers */
++  unsigned long order;
++
++  /* Ring Slots */
++  unsigned long ring_memory;
++  FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
++  char *ring_slots; /* Basically it points to ring_memory
++		       +sizeof(FlowSlotInfo) */
++
++  /* Packet Sampling */
++  u_int pktToSample, sample_rate; /* skip countdown / its reload value; only consulted when sample_rate > 1 */
++
++  /* BPF Filter */
++  struct sk_filter *bpfFilter; /* run through sk_run_filter(); a zero result drops the packet */
++
++  /* Locks */
++  atomic_t num_ring_slots_waiters;
++  wait_queue_head_t ring_slots_waitqueue; /* woken by add_skb_to_ring() once a packet was stored or counted as lost */
++  rwlock_t ring_index_lock; /* write-held around slots_info counter and index updates */
++
++  /* Indexes (Internal) */
++  u_int insert_page_id, insert_slot_id;
++};
++
++/* ************************************************* */
++
++/* List of all ring sockets. */
++static struct list_head ring_table;
++
++/* List of all clusters */
++static struct ring_cluster *ring_cluster_list;
++
++static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED; /* write-held for ring_table updates, read-held by skb_ring_handler() */
++
++/* ********************************** */
++
++/* Forward */
++static struct proto_ops ring_ops;
++
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
++static struct proto ring_proto;
++#endif
++
++static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
++			    u_char real_skb);
++static int buffer_ring_handler(struct net_device *dev, char *data, int len);
++static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
++
++/* Extern */
++
++/* ********************************** */
++
++/* Defaults */
++static u_int bucket_len = 128, num_slots = 4096, sample_rate = 1,
++  transparent_mode = 0, enable_tx_capture = 0;
++
++MODULE_PARM(bucket_len, "i");
++MODULE_PARM_DESC(bucket_len, "Number of ring buckets");
++MODULE_PARM(num_slots, "i");
++MODULE_PARM_DESC(num_slots, "Number of ring slots");
++MODULE_PARM(sample_rate, "i");
++MODULE_PARM_DESC(sample_rate, "Ring packet sample rate");
++MODULE_PARM(transparent_mode, "i");
++MODULE_PARM_DESC(transparent_mode,
++                 "Set to 1 to set transparent mode "
++                 "(slower but backwards compatible)");
++MODULE_PARM(enable_tx_capture, "i");
++MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
++
++/* ********************************** */
++
++#define MIN_QUEUED_PKTS 64
++#define MAX_QUEUE_LOOPS 64
++
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
++#define ring_sk(__sk) ((__sk)->sk_protinfo) /* the socket's struct ring_opt pointer (2.6 field name) */
++#else
++#define ring_sk_datatype(a) (a)
++#define ring_sk(__sk) ((__sk)->protinfo.pf_ring) /* the socket's struct ring_opt pointer (pre-2.6 field name) */
++#endif
++
++#define _rdtsc() ({ uint64_t x; asm volatile("rdtsc" : "=A" (x)); x; }) /* NOTE(review): the "=A" constraint looks 32-bit-x86 specific - confirm before building for another target */
++
++/*
++  int dev_queue_xmit(struct sk_buff *skb)
++  skb->dev;
++  struct net_device *dev_get_by_name(const char *name)
++*/
++
++/* ********************************** */
++/*
++ * ring_sock_destruct()
++ *
++ * Socket destructor: kfree()s the per-socket data that ring_sk(sk)
++ * refers to. If the socket is not flagged dead the function returns
++ * early and frees nothing. On 2.6 kernels the receive queue is purged
++ * first; on pre-2.6 kernels the module use count is decremented after
++ * the free (and therefore not on the early-return path).
++ */
++static void ring_sock_destruct(struct sock *sk) {
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++  skb_queue_purge(&sk->sk_receive_queue);
++
++  if (!sock_flag(sk, SOCK_DEAD)) {
++#if defined(RING_DEBUG)
++    printk("Attempt to release alive ring socket: %p\n", sk);
++#endif
++    return;
++  }
++
++  BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
++  BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
++#else
++
++  BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
++  BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
++
++  if (!sk->dead) {
++#if defined(RING_DEBUG)
++    printk("Attempt to release alive ring socket: %p\n", sk);
++#endif
++    return;
++  }
++#endif
++
++  kfree(ring_sk(sk));
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
++  MOD_DEC_USE_COUNT;
++#endif
++}
++
++/* ********************************** */
++/*
++ * ring_insert()
++ *
++ * store the sk in a new element and add it
++ * to the head of the list.
++ *
++ * The element is allocated with GFP_ATOMIC. If the allocation fails
++ * the socket is simply not added and a rate-limited message is logged.
++ */
++static inline void ring_insert(struct sock *sk) {
++  struct ring_element *next;
++
++#if defined(RING_DEBUG)
++  printk("RING: ring_insert()\n");
++#endif
++
++  next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
++  if(next != NULL) {
++    next->sk = sk;
++    write_lock_irq(&ring_mgmt_lock);
++    list_add(&next->list, &ring_table);
++    write_unlock_irq(&ring_mgmt_lock);
++  } else {
++    if (net_ratelimit())
++      printk("RING: could not kmalloc slot!!\n");
++  }
++}
++
++/* ********************************** */
++/*
++ * ring_remove()
++ *
++ * For each of the elements in the list:
++ *  - check if this is the element we want to delete
++ *  - if it is, remove it from the list, and free it.
++ *
++ * stop when we find the one we're looking for (break),
++ * or when we reach the end of the list.
++ *
++ * The whole walk runs under ring_mgmt_lock so that the list cannot
++ * change between finding the element and unlinking it. The element is
++ * freed through its own pointer (not through the embedded list_head)
++ * and only after the lock has been dropped.
++ */
++static inline void ring_remove(struct sock *sk) {
++  struct list_head *ptr;
++  struct ring_element *entry, *found = NULL;
++
++  write_lock_irq(&ring_mgmt_lock);
++
++  for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
++    entry = list_entry(ptr, struct ring_element, list);
++
++    if(entry->sk == sk) {
++      list_del(ptr);
++      found = entry;
++      break;
++    }
++  }
++
++  write_unlock_irq(&ring_mgmt_lock);
++
++  if(found != NULL)
++    kfree(found);
++}
++
++/* ********************************** */
++
++/*
++ * num_queued_pkts()
++ *
++ * Returns slots_info->insert_idx minus slots_info->tot_read, or 0 when
++ * the ring has no slot memory. The subtraction is done on u_int32_t,
++ * i.e. modulo 2^32, so a wrapped value still yields the right distance
++ * without a special case.
++ *
++ * NOTE(review): the local is called tot_insert but is loaded from
++ * insert_idx, which add_skb_to_ring() resets to 0 at tot_slots, while a
++ * separate slots_info->tot_insert counter exists - confirm which field
++ * is meant.
++ */
++static u_int32_t num_queued_pkts(struct ring_opt *pfr) {
++  u_int32_t tot_insert, tot_read, tot_pkts;
++
++  if(pfr->ring_slots == NULL)
++    return(0);
++
++  tot_insert = pfr->slots_info->insert_idx;
++  tot_read   = pfr->slots_info->tot_read;
++  tot_pkts   = tot_insert-tot_read;
++
++#if defined(RING_DEBUG)
++  printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n",
++	 tot_pkts, tot_insert, tot_read);
++#endif
++
++  return(tot_pkts);
++}
++
++/* ********************************** */
++
++/*
++ * get_insert_slot()
++ *
++ * Returns the slot at byte offset insert_idx*slot_len inside
++ * ring_slots, or NULL when the ring has no slot memory.
++ */
++static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) {
++#if defined(RING_DEBUG)
++  printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx);
++#endif
++
++  if(pfr->ring_slots != NULL) {
++    FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx
++						  *pfr->slots_info->slot_len]);
++    return(slot);
++  } else
++    return(NULL);
++}
++
++/* ********************************** */
++
++/*
++ * get_remove_slot()
++ *
++ * Same as get_insert_slot() but addressed through remove_idx.
++ */
++static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) {
++#if defined(RING_DEBUG)
++  printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx);
++#endif
++
++  if(pfr->ring_slots != NULL)
++    return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx*
++					pfr->slots_info->slot_len]));
++  else
++    return(NULL);
++}
++
++/* ********************************** */
++
++/*
++ * add_skb_to_ring()
++ *
++ * Offer one packet to the ring described by pfr. In order:
++ *  - count it in slots_info->tot_pkts;
++ *  - drop it if a BPF filter is attached and returns 0;
++ *  - if sample_rate > 1, drop it unless the pktToSample countdown has
++ *    reached 0 (the countdown is then reloaded);
++ *  - if a reflector device is set and its queue is not stopped, try to
++ *    transmit the packet there and return without storing it;
++ *  - otherwise copy a pcap_pkthdr plus (possibly truncated) payload
++ *    into the current insert slot, or count the packet in tot_lost when
++ *    that slot is missing or still marked busy.
++ * Sleepers on ring_slots_waitqueue are woken after the store/lost step.
++ *
++ * displ is how far skb->data is moved back before the packet is
++ * filtered, reflected or copied: SKB_DISPLACEMENT for real received
++ * skbs, 0 in every other case.
++ */
++static void add_skb_to_ring(struct sk_buff *skb,
++			    struct ring_opt *pfr,
++			    u_char recv_packet,
++			    u_char real_skb /* 1=skb 0=faked skb */) {
++  FlowSlot *theSlot;
++  int idx, displ;
++
++  if(recv_packet) {
++    /* Hack for identifying a packet received by the e1000 */
++    if(real_skb) {
++      displ = SKB_DISPLACEMENT;
++    } else
++      displ = 0; /* Received by the e1000 wrapper */
++  } else
++    displ = 0;
++
++  write_lock(&pfr->ring_index_lock);
++  pfr->slots_info->tot_pkts++;
++  write_unlock(&pfr->ring_index_lock);
++
++  /* BPF Filtering (from af_packet.c) */
++  if(pfr->bpfFilter != NULL) {
++    unsigned res = 1;
++
++    write_lock(&pfr->ring_index_lock);
++    skb->data -= displ;
++    res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len);
++    skb->data += displ;
++    write_unlock(&pfr->ring_index_lock);
++
++    if(res == 0) {
++      /* Filter failed */
++
++#if defined(RING_DEBUG)
++      printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
++	     "[insertIdx=%d][pkt_type=%d][cloned=%d]\n",
++	     (int)skb->len, pfr->slots_info->tot_pkts,
++	     pfr->slots_info->insert_idx,
++	     skb->pkt_type, skb->cloned);
++#endif
++
++      return;
++    }
++  }
++
++  /* ************************** */
++
++  if(pfr->sample_rate > 1) {
++    if(pfr->pktToSample == 0) {
++      write_lock(&pfr->ring_index_lock);
++      pfr->pktToSample = pfr->sample_rate;
++      write_unlock(&pfr->ring_index_lock);
++    } else {
++      write_lock(&pfr->ring_index_lock);
++      pfr->pktToSample--;
++      write_unlock(&pfr->ring_index_lock);
++
++#if defined(RING_DEBUG)
++      printk("add_skb_to_ring(skb): sampled packet [len=%d]"
++	     "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n",
++	     (int)skb->len, pfr->slots_info->tot_pkts,
++	     pfr->slots_info->insert_idx,
++	     skb->pkt_type, skb->cloned);
++#endif
++      return;
++    }
++  }
++
++  /* ************************************* */
++
++  if((pfr->reflector_dev != NULL)
++     && (!netif_queue_stopped(pfr->reflector_dev))) {
++    int cpu = smp_processor_id();
++
++    /* increase reference counter so that this skb is not freed */
++    atomic_inc(&skb->users);
++
++    skb->data -= displ;
++
++    /* send it */
++    if (pfr->reflector_dev->xmit_lock_owner != cpu) {
++      spin_lock_bh(&pfr->reflector_dev->xmit_lock);
++      pfr->reflector_dev->xmit_lock_owner = cpu;
++      spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
++
++      if (pfr->reflector_dev->hard_start_xmit(skb,
++					      pfr->reflector_dev) == 0) {
++        spin_lock_bh(&pfr->reflector_dev->xmit_lock);
++	pfr->reflector_dev->xmit_lock_owner = -1;
++	skb->data += displ;
++        spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
++#if defined(RING_DEBUG)
++	printk("++ hard_start_xmit succeeded\n");
++#endif
++	return; /* OK */
++      }
++
++      spin_lock_bh(&pfr->reflector_dev->xmit_lock);
++      pfr->reflector_dev->xmit_lock_owner = -1;
++      spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
++    }
++
++#if defined(RING_DEBUG)
++    printk("++ hard_start_xmit failed\n");
++#endif
++    /*
++     * The driver did not take the skb (transmit refused or not even
++     * attempted), so nobody will drop the reference taken above:
++     * give it back here, otherwise the skb could never be freed.
++     */
++    atomic_dec(&skb->users);
++    skb->data += displ;
++    return; /* -ENETDOWN */
++  }
++
++  /* ************************************* */
++
++#if defined(RING_DEBUG)
++  printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]"
++	 "[pkt_type=%d][cloned=%d]\n",
++	 (int)skb->len, pfr->slots_info->tot_pkts,
++	 pfr->slots_info->insert_idx,
++	 skb->pkt_type, skb->cloned);
++#endif
++
++  idx = pfr->slots_info->insert_idx;
++  theSlot = get_insert_slot(pfr);
++
++  if((theSlot != NULL) && (theSlot->slot_state == 0)) {
++    struct pcap_pkthdr *hdr;
++    unsigned int bucketSpace;
++    char *bucket;
++
++    /* Update Index */
++    idx++;
++
++    if(idx == pfr->slots_info->tot_slots) {
++      write_lock(&pfr->ring_index_lock);
++      pfr->slots_info->insert_idx = 0;
++      write_unlock(&pfr->ring_index_lock);
++    } else {
++      write_lock(&pfr->ring_index_lock);
++      pfr->slots_info->insert_idx = idx;
++      write_unlock(&pfr->ring_index_lock);
++    }
++
++    /* Payload room left in a slot once its bookkeeping is subtracted */
++    bucketSpace = pfr->slots_info->slot_len
++#ifdef RING_MAGIC
++      - sizeof(u_char)
++#endif
++      - sizeof(u_char) /* flowSlot.slot_state */
++      - sizeof(struct pcap_pkthdr)
++      - 1 /* 10 */ /* safe boundary */;
++
++    bucket = &theSlot->bucket;
++    hdr = (struct pcap_pkthdr*)bucket;
++
++    /* A zero timestamp means "not stamped yet": stamp it now */
++    if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp);
++
++    hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec;
++    hdr->caplen    = skb->len+displ;
++
++    /* Truncate what is copied to the room available in the slot */
++    if(hdr->caplen > bucketSpace)
++      hdr->caplen = bucketSpace;
++
++    hdr->len = skb->len+displ;
++    memcpy(&bucket[sizeof(struct pcap_pkthdr)],
++	   skb->data-displ, hdr->caplen);
++
++#if defined(RING_DEBUG)
++    {
++      static unsigned int lastLoss = 0;
++
++      if(pfr->slots_info->tot_lost
++	 && (lastLoss != pfr->slots_info->tot_lost)) {
++	printk("add_skb_to_ring(%d): [bucketSpace=%d]"
++	       "[hdr.caplen=%d][skb->len=%d]"
++	       "[pcap_pkthdr=%d][removeIdx=%d]"
++	       "[loss=%lu][page=%u][slot=%u]\n",
++	       idx-1, bucketSpace, hdr->caplen, skb->len,
++	       sizeof(struct pcap_pkthdr),
++	       pfr->slots_info->remove_idx,
++	       (long unsigned int)pfr->slots_info->tot_lost,
++	       pfr->insert_page_id, pfr->insert_slot_id);
++
++	lastLoss = pfr->slots_info->tot_lost;
++      }
++    }
++#endif
++
++    write_lock(&pfr->ring_index_lock);
++    pfr->slots_info->tot_insert++;
++    theSlot->slot_state = 1;
++    write_unlock(&pfr->ring_index_lock);
++  } else {
++    write_lock(&pfr->ring_index_lock);
++    pfr->slots_info->tot_lost++;
++    write_unlock(&pfr->ring_index_lock);
++
++#if defined(RING_DEBUG)
++    printk("add_skb_to_ring(skb): packet lost [loss=%lu]"
++	   "[removeIdx=%u][insertIdx=%u]\n",
++	   (long unsigned int)pfr->slots_info->tot_lost,
++	   pfr->slots_info->remove_idx, pfr->slots_info->insert_idx);
++#endif
++  }
++
++  /* wakeup in case of poll() */
++  if(waitqueue_active(&pfr->ring_slots_waitqueue))
++    wake_up_interruptible(&pfr->ring_slots_waitqueue);
++}
++
++/* ********************************** */
++
++/*
++ * hash_skb()
++ *
++ * Picks the index into cluster_ptr->sk[] for this packet. In
++ * round-robin mode the cluster's hashing_id counter is used (and
++ * incremented). Otherwise the index is the sum of the IP source and
++ * destination addresses and the protocol number, plus both ports for
++ * TCP and UDP; packets no longer than an IP plus a TCP header use
++ * their length instead. The result is reduced modulo
++ * num_cluster_elements, which the caller must have checked to be > 0.
++ *
++ * NOTE(review): the port lookup assumes an IP header of exactly
++ * sizeof(struct iphdr) bytes and does not check that the packet is
++ * IPv4 at all - confirm that this is acceptable for hashing purposes.
++ */
++static u_int hash_skb(struct ring_cluster *cluster_ptr,
++		      struct sk_buff *skb, u_char recv_packet) {
++  u_int idx;
++  int displ;
++  struct iphdr *ip;
++
++  if(cluster_ptr->hashing_mode == cluster_round_robin) {
++    idx = cluster_ptr->hashing_id++;
++  } else {
++    /* Per-flow clustering */
++    if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) {
++      if(recv_packet)
++	displ = 0;
++      else
++	displ = SKB_DISPLACEMENT;
++
++      /*
++	skb->data+displ
++
++	Always points to the IP part of the packet
++      */
++
++      ip = (struct iphdr*)(skb->data+displ);
++
++      idx = ip->saddr+ip->daddr+ip->protocol;
++
++      if(ip->protocol == IPPROTO_TCP) {
++	struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ
++					      +sizeof(struct iphdr));
++	idx += tcp->source+tcp->dest;
++      } else if(ip->protocol == IPPROTO_UDP) {
++	struct udphdr *udp = (struct udphdr*)(skb->data+displ
++					      +sizeof(struct iphdr));
++	idx += udp->source+udp->dest;
++      }
++    } else
++      idx = skb->len;
++  }
++
++  return(idx % cluster_ptr->num_cluster_elements);
++}
++
++/* ********************************** */
++
++/*
++ * skb_ring_handler()
++ *
++ * Offers skb to every unclustered ring socket bound to skb->dev and,
++ * for every cluster, to the one member hash_skb() selects (if that
++ * member is bound to skb->dev). Transmitted packets (recv_packet == 0)
++ * are ignored unless enable_tx_capture is set.
++ *
++ * Returns 1 when at least one ring was offered the packet and
++ * transparent_mode is off; in that case a real skb is also freed here.
++ * Returns 0 otherwise ("packet not handled").
++ *
++ * ring_mgmt_lock is read-held across both list walks, so that neither
++ * ring_table nor the sockets reached through it can be unlinked and
++ * freed underneath the walk.
++ */
++static int skb_ring_handler(struct sk_buff *skb,
++			    u_char recv_packet,
++			    u_char real_skb /* 1=skb 0=faked skb */) {
++  struct sock *skElement;
++  int rc = 0;
++  struct list_head *ptr;
++  struct ring_cluster *cluster_ptr;
++
++#ifdef PROFILING
++  uint64_t rdt = _rdtsc(), rdt1, rdt2;
++#endif
++
++  if((!skb) /* Invalid skb */
++     || ((!enable_tx_capture) && (!recv_packet))) {
++    /*
++      An outgoing packet is about to be sent out
++      but we decided not to handle transmitted
++      packets.
++    */
++    return(0);
++  }
++
++#if defined(RING_DEBUG)
++  if(0) {
++    printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len,
++	   skb->dev->name == NULL ? "" : skb->dev->name);
++  }
++#endif
++
++#ifdef PROFILING
++  rdt1 = _rdtsc();
++#endif
++
++  read_lock(&ring_mgmt_lock);
++
++  /* [1] Check unclustered sockets */
++  for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
++    struct ring_opt *pfr;
++    struct ring_element *entry;
++
++    entry = list_entry(ptr, struct ring_element, list);
++
++    skElement = entry->sk;
++    pfr = ring_sk(skElement);
++
++    if((pfr != NULL)
++       && (pfr->cluster_id == 0 /* No cluster */)
++       && (pfr->ring_slots != NULL)
++       && (pfr->ring_netdev == skb->dev)) {
++      /* We've found the ring where the packet can be stored */
++      add_skb_to_ring(skb, pfr, recv_packet, real_skb);
++
++      rc = 1; /* Ring found: we've done our job */
++    }
++  }
++
++  /* [2] Check socket clusters */
++  cluster_ptr = ring_cluster_list;
++
++  while(cluster_ptr != NULL) {
++    struct ring_opt *pfr;
++
++    if(cluster_ptr->num_cluster_elements > 0) {
++      u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet);
++
++      skElement = cluster_ptr->sk[skb_hash];
++
++      if(skElement != NULL) {
++	pfr = ring_sk(skElement);
++
++	if((pfr != NULL)
++	   && (pfr->ring_slots != NULL)
++	   && (pfr->ring_netdev == skb->dev)) {
++	  /* We've found the ring where the packet can be stored */
++	  add_skb_to_ring(skb, pfr, recv_packet, real_skb);
++
++	  rc = 1; /* Ring found: we've done our job */
++	}
++      }
++    }
++
++    cluster_ptr = cluster_ptr->next;
++  }
++
++  read_unlock(&ring_mgmt_lock);
++
++#ifdef PROFILING
++  rdt1 = _rdtsc()-rdt1;
++#endif
++
++#ifdef PROFILING
++  rdt2 = _rdtsc();
++#endif
++
++  /* In transparent mode the caller keeps ownership of the packet */
++  if(transparent_mode) rc = 0;
++
++  if((rc != 0) && real_skb)
++    dev_kfree_skb(skb); /* Free the skb */
++
++#ifdef PROFILING
++  rdt2 = _rdtsc()-rdt2;
++  rdt = _rdtsc()-rdt;
++
++#if defined(RING_DEBUG)
++  printk("# cycles: %d [lock costed %d %d%%][free costed %d %d%%]\n",
++	 (int)rdt, rdt-rdt1,
++	 (int)((float)((rdt-rdt1)*100)/(float)rdt),
++	 rdt2,
++	 (int)((float)(rdt2*100)/(float)rdt));
++#endif
++#endif
++
++  return(rc); /* 0 = packet not handled */
++}
++
++/* ********************************** */
++
++/*
++ * Scratch sk_buff that buffer_ring_handler() fills in to present a raw
++ * buffer to skb_ring_handler().
++ * NOTE(review): this single file-scope object is rewritten on every
++ * call, so concurrent callers would overwrite each other's fields -
++ * confirm that buffer_ring_handler() is never entered concurrently.
++ */
++struct sk_buff skb;
++
++/*
++ * buffer_ring_handler()
++ *
++ * Wraps (dev, data, len) in the scratch skb above and passes it to
++ * skb_ring_handler() as a received, fake skb. The timestamp is zeroed
++ * so that add_skb_to_ring() stamps the packet itself. Always returns 0.
++ */
++static int buffer_ring_handler(struct net_device *dev,
++			       char *data, int len) {
++
++#if defined(RING_DEBUG)
++  printk("buffer_ring_handler: [dev=%s][len=%d]\n",
++	 dev->name == NULL ? "" : dev->name, len);
++#endif
++
++  skb.dev = dev, skb.len = len, skb.data = data,
++    skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */
++
++  skb_ring_handler(&skb, 1, 0 /* fake skb */);
++
++  return(0);
++}
++
++/* ********************************** */
++
++static int ring_create(struct socket *sock, int protocol) {
++  struct sock *sk;
++  struct ring_opt *pfr;
++  int err;
++
++#if defined(RING_DEBUG)
++  printk("RING: ring_create()\n");
++#endif
++
++  /* Are you root, superuser or so ? */
++  if(!capable(CAP_NET_ADMIN))
++    return -EPERM;
++
++  if(sock->type != SOCK_RAW)
++    return -ESOCKTNOSUPPORT;
++
++  if(protocol != htons(ETH_P_ALL))
++    return -EPROTONOSUPPORT;
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
++  MOD_INC_USE_COUNT;
++#endif
++
++  err = -ENOMEM;
++
++  // BD: -- broke this out to keep it more simple and clear as to what the
++  // options are.
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL); ++#endif ++#endif ++ ++ // BD: API changed in 2.6.12, ref: ++ // http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1); ++#endif ++ ++ if (sk == NULL) ++ goto out; ++ ++ sock->ops = &ring_ops; ++ sock_init_data(sock, sk); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk_set_owner(sk, THIS_MODULE); ++#endif ++#endif ++ ++ err = -ENOMEM; ++ ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL)); ++ ++ if (!(pfr = ring_sk(sk))) { ++ sk_free(sk); ++ goto out; ++ } ++ memset(pfr, 0, sizeof(*pfr)); ++ init_waitqueue_head(&pfr->ring_slots_waitqueue); ++ pfr->ring_index_lock = RW_LOCK_UNLOCKED; ++ atomic_set(&pfr->num_ring_slots_waiters, 0); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ sk->sk_family = PF_RING; ++ sk->sk_destruct = ring_sock_destruct; ++#else ++ sk->family = PF_RING; ++ sk->destruct = ring_sock_destruct; ++ sk->num = protocol; ++#endif ++ ++ ring_insert(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create() - created\n"); ++#endif ++ ++ return(0); ++ out: ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_DEC_USE_COUNT; ++#endif ++ return err; ++} ++ ++/* *********************************************** */ ++ ++static int ring_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ ++ if(!sk) ++ return 0; ++ ++#if defined(RING_DEBUG) ++ printk("RING: called ring_release\n"); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release entered\n"); ++#endif ++ ++ ring_remove(sk); ++ ++ sock_orphan(sk); ++ sock->sk = NULL; ++ ++ /* Free the ring buffer */ ++ if(pfr->ring_memory) { ++ struct page *page, *page_end; ++ ++ page_end = 
virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ ClearPageReserved(page); ++ ++ free_pages(pfr->ring_memory, pfr->order); ++ } ++ ++ kfree(pfr); ++ ring_sk(sk) = NULL; ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ skb_queue_purge(&sk->sk_write_queue); ++#endif ++ sock_put(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release leaving\n"); ++#endif ++ ++ return 0; ++} ++ ++/* ********************************** */ ++/* ++ * We create a ring for this socket and bind it to the specified device ++ */ ++static int packet_ring_bind(struct sock *sk, struct net_device *dev) ++{ ++ u_int the_slot_len; ++ u_int32_t tot_mem; ++ struct ring_opt *pfr = ring_sk(sk); ++ struct page *page, *page_end; ++ ++ if(!dev) return(-1); ++ ++#if defined(RING_DEBUG) ++ printk("RING: packet_ring_bind(%s) called\n", dev->name); ++#endif ++ ++ /* ********************************************** ++ ++ ************************************* ++ * * ++ * FlowSlotInfo * ++ * * ++ ************************************* <-+ ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* +- num_slots ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* <-+ ++ ++ ********************************************** */ ++ ++ the_slot_len = sizeof(u_char) /* flowSlot.slot_state */ ++ + sizeof(u_short) /* flowSlot.slot_len */ ++ + bucket_len /* flowSlot.bucket */; ++ ++ tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len; ++ ++ /* ++ Calculate the value of the order parameter used later. ++ See http://www.linuxjournal.com/article.php?sid=1133 ++ */ ++ for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ; ++ ++ /* ++ We now try to allocate the memory as required. If we fail ++ we try to allocate a smaller amount or memory (hence a ++ smaller ring). 
++ */ ++ while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0) ++ if(pfr->order-- == 0) ++ break; ++ ++ if(pfr->order == 0) { ++#if defined(RING_DEBUG) ++ printk("ERROR: not enough memory\n"); ++#endif ++ return(-1); ++ } else { ++#if defined(RING_DEBUG) ++ printk("RING: succesfully allocated %lu KB [tot_mem=%d][order=%ld]\n", ++ PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order); ++#endif ++ } ++ ++ tot_mem = PAGE_SIZE << pfr->order; ++ memset((char*)pfr->ring_memory, 0, tot_mem); ++ ++ /* Now we need to reserve the pages */ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ SetPageReserved(page); ++ ++ pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory; ++ pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo)); ++ ++ pfr->slots_info->version = RING_FLOWSLOT_VERSION; ++ pfr->slots_info->slot_len = the_slot_len; ++ pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len; ++ pfr->slots_info->tot_mem = tot_mem; ++ pfr->slots_info->sample_rate = sample_rate; ++ ++#if defined(RING_DEBUG) ++ printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n", ++ pfr->slots_info->tot_slots, pfr->slots_info->slot_len, ++ pfr->slots_info->tot_mem); ++#endif ++ ++#ifdef RING_MAGIC ++ { ++ int i; ++ ++ for(i=0; islots_info->tot_slots; i++) { ++ unsigned long idx = i*pfr->slots_info->slot_len; ++ FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx]; ++ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; ++ } ++ } ++#endif ++ ++ pfr->insert_page_id = 1, pfr->insert_slot_id = 0; ++ ++ /* ++ IMPORTANT ++ Leave this statement here as last one. In fact when ++ the ring_netdev != NULL the socket is ready to be used. 
++ */ ++ pfr->ring_netdev = dev; ++ ++ return(0); ++} ++ ++/* ************************************* */ ++ ++/* Bind to a device */ ++static int ring_bind(struct socket *sock, ++ struct sockaddr *sa, int addr_len) ++{ ++ struct sock *sk=sock->sk; ++ struct net_device *dev = NULL; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_bind() called\n"); ++#endif ++ ++ /* ++ * Check legality ++ */ ++ if (addr_len != sizeof(struct sockaddr)) ++ return -EINVAL; ++ if (sa->sa_family != PF_RING) ++ return -EINVAL; ++ ++ /* Safety check: add trailing zero if missing */ ++ sa->sa_data[sizeof(sa->sa_data)-1] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("RING: searching device %s\n", sa->sa_data); ++#endif ++ ++ if((dev = __dev_get_by_name(sa->sa_data)) == NULL) { ++#if defined(RING_DEBUG) ++ printk("RING: search failed\n"); ++#endif ++ return(-EINVAL); ++ } else ++ return(packet_ring_bind(sk, dev)); ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ ++volatile void* virt_to_kseg(volatile void* address) { ++ pte_t *pte; ++ pud_t *pud; ++ unsigned long addr = (unsigned long)address; ++ ++ pud = pud_offset(pgd_offset_k((unsigned long) address), ++ (unsigned long) address); ++ ++ /* ++ High-memory support courtesy of ++ Brad Doctor ++ */ ++#if defined(CONFIG_X86_PAE) && (!defined(CONFIG_NOHIGHMEM)) ++ pte = pte_offset_map(pmd_offset(pud, addr), addr); ++#else ++ pte = pmd_offset_map(pud, addr); ++#endif ++ ++ return((volatile void*)pte_page(*pte)); ++} ++ ++#else /* 2.4 */ ++ ++/* http://www.scs.ch/~frey/linux/memorymap.html */ ++volatile void *virt_to_kseg(volatile void *address) ++{ ++ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; ++ unsigned long va, ret = 0UL; ++ ++ va=VMALLOC_VMADDR((unsigned long)address); ++ ++ /* get the page directory. Use the kernel memory map. 
*/ ++ pgd = pgd_offset_k(va); ++ ++ /* check whether we found an entry */ ++ if (!pgd_none(*pgd)) ++ { ++ /* get the page middle directory */ ++ pmd = pmd_offset(pgd, va); ++ /* check whether we found an entry */ ++ if (!pmd_none(*pmd)) ++ { ++ /* get a pointer to the page table entry */ ++ ptep = pte_offset(pmd, va); ++ pte = *ptep; ++ /* check for a valid page */ ++ if (pte_present(pte)) ++ { ++ /* get the address the page is refering to */ ++ ret = (unsigned long)page_address(pte_page(pte)); ++ /* add the offset within the page to the page address */ ++ ret |= (va & (PAGE_SIZE -1)); ++ } ++ } ++ } ++ return((volatile void *)ret); ++} ++#endif ++ ++/* ************************************* */ ++ ++static int ring_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ unsigned long size, start; ++ u_int pagesToMap; ++ char *ptr; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called\n"); ++#endif ++ ++ if(pfr->ring_memory == 0) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: mapping area to an unbound socket\n"); ++#endif ++ return -EINVAL; ++ } ++ ++ size = (unsigned long)(vma->vm_end-vma->vm_start); ++ ++ if(size % PAGE_SIZE) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n"); ++#endif ++ return(-EINVAL); ++ } ++ ++ /* if userspace tries to mmap beyond end of our buffer, fail */ ++ if(size > pfr->slots_info->tot_mem) { ++#if defined(RING_DEBUG) ++ printk("proc_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem); ++#endif ++ return(-EINVAL); ++ } ++ ++ pagesToMap = size/PAGE_SIZE; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called. 
%d pages to map\n", pagesToMap); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n", ++ pfr->slots_info->slot_len, pfr->slots_info->tot_slots, ++ pfr->ring_netdev->name); ++#endif ++ ++ /* we do not want to have this area swapped out, lock it */ ++ vma->vm_flags |= VM_LOCKED; ++ start = vma->vm_start; ++ ++ /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */ ++ ptr = (char*)(start+PAGE_SIZE); ++ ++ if(remap_page_range( ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ vma, ++#endif ++ start, ++ __pa(pfr->ring_memory), ++ PAGE_SIZE*pagesToMap, vma->vm_page_prot)) { ++#if defined(RING_DEBUG) ++ printk("remap_page_range() failed\n"); ++#endif ++ return(-EAGAIN); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("proc_mmap(pagesToMap=%d): success.\n", pagesToMap); ++#endif ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++static int ring_recvmsg(struct kiocb *iocb, struct socket *sock, ++ struct msghdr *msg, size_t len, int flags) ++#else ++ static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len, ++ int flags, struct scm_cookie *scm) ++#endif ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ u_int32_t queued_pkts, num_loops = 0; ++ ++#if defined(RING_DEBUG) ++ printk("ring_recvmsg called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) { ++ wait_event_interruptible(pfr->ring_slots_waitqueue, 1); ++ ++#if defined(RING_DEBUG) ++ printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n", ++ slot->slot_state, queued_pkts, num_loops); ++#endif ++ ++ if(queued_pkts > 0) { ++ if(num_loops++ > MAX_QUEUE_LOOPS) ++ break; ++ } ++ } ++ ++#if defined(RING_DEBUG) ++ if(slot != NULL) ++ printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n", ++ queued_pkts, num_loops); ++#endif ++ ++ 
return(queued_pkts); ++} ++ ++/* ************************************* */ ++ ++unsigned int ring_poll(struct file * file, ++ struct socket *sock, poll_table *wait) ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ ++#if defined(RING_DEBUG) ++ printk("poll called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ if((slot != NULL) && (slot->slot_state == 0)) ++ poll_wait(file, &pfr->ring_slots_waitqueue, wait); ++ ++#if defined(RING_DEBUG) ++ printk("poll returning %d\n", slot->slot_state); ++#endif ++ ++ if((slot != NULL) && (slot->slot_state == 1)) ++ return(POLLIN | POLLRDNORM); ++ else ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int add_to_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ ++ if(el->num_cluster_elements == CLUSTER_LEN) ++ return(-1); /* Cluster full */ ++ ++ ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id; ++ el->sk[el->num_cluster_elements] = sock; ++ el->num_cluster_elements++; ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int remove_from_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ int i, j; ++ ++ for(i=0; isk[i] == sock) { ++ el->num_cluster_elements--; ++ ++ if(el->num_cluster_elements > 0) { ++ /* The cluster contains other elements */ ++ for(j=i; jsk[j] = el->sk[j+1]; ++ ++ el->sk[CLUSTER_LEN-1] = NULL; ++ } else { ++ /* Empty cluster */ ++ memset(el->sk, 0, sizeof(el->sk)); ++ } ++ ++ return(0); ++ } ++ ++ return(-1); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int remove_from_cluster(struct sock *sock, ++ struct ring_opt *pfr) ++{ ++ struct ring_cluster *el; ++ ++#if defined(RING_DEBUG) ++ printk("--> remove_from_cluster(%d)\n", pfr->cluster_id); ++#endif ++ ++ if(pfr->cluster_id == 0 /* 0 = No Cluster */) ++ return(0); /* Noting to do */ ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == pfr->cluster_id) { ++ return(remove_from_cluster_list(el, 
sock)); ++ } else ++ el = el->next; ++ } ++ ++ return(-EINVAL); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int add_to_cluster(struct sock *sock, ++ struct ring_opt *pfr, ++ u_short cluster_id) ++{ ++ struct ring_cluster *el; ++ ++#ifndef RING_DEBUG ++ printk("--> add_to_cluster(%d)\n", cluster_id); ++#endif ++ ++ if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL); ++ ++ if(pfr->cluster_id != 0) ++ remove_from_cluster(sock, pfr); ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == cluster_id) { ++ return(add_to_cluster_list(el, sock)); ++ } else ++ el = el->next; ++ } ++ ++ /* There's no existing cluster. We need to create one */ ++ if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL) ++ return(-ENOMEM); ++ ++ el->cluster_id = cluster_id; ++ el->num_cluster_elements = 1; ++ el->hashing_mode = cluster_per_flow; /* Default */ ++ el->hashing_id = 0; ++ ++ memset(el->sk, 0, sizeof(el->sk)); ++ el->sk[0] = sock; ++ el->next = ring_cluster_list; ++ ring_cluster_list = el; ++ pfr->cluster_id = cluster_id; ++ ++ return(0); /* 0 = OK */ ++} ++ ++/* ************************************* */ ++ ++/* Code taken/inspired from core/sock.c */ ++static int ring_setsockopt(struct socket *sock, ++ int level, int optname, ++ char *optval, int optlen) ++{ ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ int val, found, ret = 0; ++ u_int cluster_id; ++ char devName[8]; ++ ++ if((optleninsns, fprog.filter, fsize)) ++ break; ++ ++ filter->len = fprog.len; ++ ++ if(sk_chk_filter(filter->insns, filter->len) != 0) { ++ /* Bad filter specified */ ++ kfree(filter); ++ pfr->bpfFilter = NULL; ++ break; ++ } ++ ++ /* get the lock, set the filter, release the lock */ ++ write_lock(&ring_mgmt_lock); ++ pfr->bpfFilter = filter; ++ write_unlock(&ring_mgmt_lock); ++ } ++ ret = 0; ++ break; ++ ++ case SO_DETACH_FILTER: ++ write_lock(&ring_mgmt_lock); ++ found = 1; ++ if(pfr->bpfFilter != NULL) { ++ 
kfree(pfr->bpfFilter); ++ pfr->bpfFilter = NULL; ++ write_unlock(&ring_mgmt_lock); ++ break; ++ } ++ ret = -ENONET; ++ break; ++ ++ case SO_ADD_TO_CLUSTER: ++ if (optlen!=sizeof(val)) ++ return -EINVAL; ++ ++ if (copy_from_user(&cluster_id, optval, sizeof(cluster_id))) ++ return -EFAULT; ++ ++ write_lock(&ring_mgmt_lock); ++ ret = add_to_cluster(sock->sk, pfr, cluster_id); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_REMOVE_FROM_CLUSTER: ++ write_lock(&ring_mgmt_lock); ++ ret = remove_from_cluster(sock->sk, pfr); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_SET_REFLECTOR: ++ if(optlen >= (sizeof(devName)-1)) ++ return -EINVAL; ++ ++ if(optlen > 0) { ++ if(copy_from_user(devName, optval, optlen)) ++ return -EFAULT; ++ } ++ ++ devName[optlen] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("+++ SO_SET_REFLECTOR(%s)\n", devName); ++#endif ++ ++ write_lock(&ring_mgmt_lock); ++ pfr->reflector_dev = dev_get_by_name(devName); ++ write_unlock(&ring_mgmt_lock); ++ ++#if defined(RING_DEBUG) ++ if(pfr->reflector_dev != NULL) ++ printk("SO_SET_REFLECTOR(%s): succeded\n", devName); ++ else ++ printk("SO_SET_REFLECTOR(%s): device unknown\n", devName); ++#endif ++ break; ++ ++ default: ++ found = 0; ++ break; ++ } ++ ++ if(found) ++ return(ret); ++ else ++ return(sock_setsockopt(sock, level, optname, optval, optlen)); ++} ++ ++/* ************************************* */ ++ ++static int ring_ioctl(struct socket *sock, ++ unsigned int cmd, unsigned long arg) ++{ ++ switch(cmd) ++ { ++ case SIOCGIFFLAGS: ++ case SIOCSIFFLAGS: ++ case SIOCGIFCONF: ++ case SIOCGIFMETRIC: ++ case SIOCSIFMETRIC: ++ case SIOCGIFMEM: ++ case SIOCSIFMEM: ++ case SIOCGIFMTU: ++ case SIOCSIFMTU: ++ case SIOCSIFLINK: ++ case SIOCGIFHWADDR: ++ case SIOCSIFHWADDR: ++ case SIOCSIFMAP: ++ case SIOCGIFMAP: ++ case SIOCSIFSLAVE: ++ case SIOCGIFSLAVE: ++ case SIOCGIFINDEX: ++ case SIOCGIFNAME: ++ case SIOCGIFCOUNT: ++ case SIOCSIFHWBROADCAST: ++ return(dev_ioctl(cmd,(void *) arg)); ++ ++ 
default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++static struct proto_ops ring_ops = { ++ .family = PF_RING, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++ ++ /* Operations that make no sense on ring sockets. */ ++ .connect = sock_no_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = sock_no_getname, ++ .listen = sock_no_listen, ++ .shutdown = sock_no_shutdown, ++ .sendpage = sock_no_sendpage, ++ .sendmsg = sock_no_sendmsg, ++ .getsockopt = sock_no_getsockopt, ++ ++ /* Now the operations that really occur. */ ++ .release = ring_release, ++ .bind = ring_bind, ++ .mmap = ring_mmap, ++ .poll = ring_poll, ++ .setsockopt = ring_setsockopt, ++ .ioctl = ring_ioctl, ++ .recvmsg = ring_recvmsg, ++}; ++ ++/* ************************************ */ ++ ++static struct net_proto_family ring_family_ops = { ++ .family = PF_RING, ++ .create = ring_create, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++}; ++ ++// BD: API changed in 2.6.12, ref: ++// http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++static struct proto ring_proto = { ++ .name = "PF_RING", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct sock), ++}; ++#endif ++ ++/* ************************************ */ ++ ++static void __exit ring_exit(void) ++{ ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ kfree(entry); ++ } ++ ++ while(ring_cluster_list != NULL) { ++ struct ring_cluster *next = ring_cluster_list->next; ++ kfree(ring_cluster_list); ++ ring_cluster_list = next; ++ } ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ ++ printk("PF_RING shut down.\n"); ++} ++ ++/* 
************************************ */ ++ ++static int __init ring_init(void) ++{ ++ printk("Welcome to PF_RING %s\n(C) 2004 L.Deri \n", ++ RING_VERSION); ++ ++ INIT_LIST_HEAD(&ring_table); ++ ring_cluster_list = NULL; ++ ++ sock_register(&ring_family_ops); ++ ++ set_skb_ring_handler(skb_ring_handler); ++ set_buffer_ring_handler(buffer_ring_handler); ++ ++ if(get_buffer_ring_handler() != buffer_ring_handler) { ++ printk("PF_RING: set_buffer_ring_handler FAILED\n"); ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ return -1; ++ } else { ++ printk("PF_RING: bucket length %d bytes\n", bucket_len); ++ printk("PF_RING: ring slots %d\n", num_slots); ++ printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate); ++ printk("PF_RING: capture TX %s\n", ++ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]"); ++ printk("PF_RING: transparent mode %s\n", ++ transparent_mode ? "Yes" : "No"); ++ ++ printk("PF_RING initialized correctly.\n"); ++ return 0; ++ } ++} ++ ++module_init(ring_init); ++module_exit(ring_exit); ++MODULE_LICENSE("GPL"); ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++MODULE_ALIAS_NETPROTO(PF_RING); ++#endif diff --git a/target/linux/linux-2.6/patches/generic/104-pf_ring.patch b/target/linux/linux-2.6/patches/generic/104-pf_ring.patch new file mode 100644 index 000000000..759fb2cc9 --- /dev/null +++ b/target/linux/linux-2.6/patches/generic/104-pf_ring.patch @@ -0,0 +1,5299 @@ +diff --unified --recursive --new-file linux-2.6.12.5/include/linux/ring.h linux-2.6.12.5-1-686-smp-ring3/include/linux/ring.h +--- linux-2.6.12.5/include/linux/ring.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/include/linux/ring.h 2005-10-22 23:50:44.951445250 +0200 +@@ -0,0 +1,108 @@ ++/* ++ * Definitions for packet ring ++ * ++ * 2004 - Luca Deri ++ */ ++#ifndef __RING_H ++#define __RING_H ++ ++ ++#define INCLUDE_MAC_INFO ++ ++#ifdef INCLUDE_MAC_INFO ++#define SKB_DISPLACEMENT 14 /* Include 
MAC address information */
++#else
++#define SKB_DISPLACEMENT     0 /* Do NOT include MAC address information */
++#endif
++/* RING_MAGIC adds the per-slot magic byte to struct flowSlot below */
++#define RING_MAGIC
++#define RING_MAGIC_VALUE      0x88
++#define RING_FLOWSLOT_VERSION    5
++#define RING_VERSION         "3.0"
++/* PF_RING-specific option names handled by the socket's setsockopt() */
++#define SO_ADD_TO_CLUSTER      99
++#define SO_REMOVE_FROM_CLUSTER 100
++#define SO_SET_REFLECTOR       101
++
++/* *********************************** */
++/* Local definition of the pcap packet header, compiled only when HAVE_PCAP is not defined */
++#ifndef HAVE_PCAP
++struct pcap_pkthdr {
++  struct timeval ts;    /* time stamp */
++  u_int32_t caplen;     /* length of portion present */
++  u_int32_t len;        /* length this packet (off wire) */
++};
++#endif
++
++/* *********************************** */
++/* How a cluster selects the member socket that receives a packet */
++enum cluster_type {
++  cluster_per_flow = 0,
++  cluster_round_robin
++};
++
++/* *********************************** */
++/* Slot size bounds: 60 and 1514 bytes of packet data plus the pcap header */
++#define RING_MIN_SLOT_SIZE    (60+sizeof(struct pcap_pkthdr))
++#define RING_MAX_SLOT_SIZE    (1514+sizeof(struct pcap_pkthdr))
++
++/* *********************************** */
++/* Ring descriptor: geometry (slot count, slot length, total memory), counters and insert/remove indexes */
++typedef struct flowSlotInfo {
++  u_int16_t version, sample_rate;
++  u_int32_t tot_slots, slot_len, tot_mem;
++
++  u_int64_t tot_pkts, tot_lost;
++  u_int64_t tot_insert, tot_read;
++  u_int16_t insert_idx;
++  u_int16_t remove_idx;
++} FlowSlotInfo;
++
++/* *********************************** */
++/* One ring slot: optional magic byte, state flag, then the first byte of the packet bucket */
++typedef struct flowSlot {
++#ifdef RING_MAGIC
++  u_char     magic;      /* NOTE(review): was documented as "must always be zero", but the 2.4 ring.c in this commit stores RING_MAGIC_VALUE here - confirm */
++#endif
++  u_char     slot_state; /* 0=empty, 1=full   */
++  u_char     bucket;     /* bucket[bucketLen] */
++} FlowSlot;
++
++/* *********************************** */
++
++#ifdef __KERNEL__
++
++FlowSlotInfo* getRingPtr(void);
++int allocateRing(char *deviceName, u_int numSlots,
++                 u_int bucketLen, u_int sampleRate);
++unsigned int pollRing(struct file *fp, struct poll_table_struct * wait);
++void deallocateRing(void);
++
++/* ************************* */
++/* Hook type for whole skbs; recv_packet and real_skb are passed through unchanged to the installed handler */
++typedef int (*handle_ring_skb)(struct sk_buff *skb,
++                               u_char recv_packet, u_char real_skb);
++extern handle_ring_skb get_skb_ring_handler(void);
++extern void set_skb_ring_handler(handle_ring_skb the_handler);
++extern void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++ ++typedef int (*handle_ring_buffer)(struct net_device *dev, ++ char *data, int len); ++extern handle_ring_buffer get_buffer_ring_handler(void); ++extern void set_buffer_ring_handler(handle_ring_buffer the_handler); ++extern int do_buffer_ring_handler(struct net_device *dev, ++ char *data, int len); ++#endif /* __KERNEL__ */ ++ ++/* *********************************** */ ++ ++#define PF_RING 27 /* Packet Ring */ ++#define SOCK_RING PF_RING ++ ++/* ioctl() */ ++#define SIORINGPOLL 0x8888 ++ ++/* *********************************** */ ++ ++#endif /* __RING_H */ +diff --unified --recursive --new-file linux-2.6.12.5/net/Kconfig linux-2.6.12.5-1-686-smp-ring3/net/Kconfig +--- linux-2.6.12.5/net/Kconfig 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Kconfig 2005-10-22 23:50:45.535481750 +0200 +@@ -72,6 +72,7 @@ + + Say Y unless you know what you are doing. + ++source "net/ring/Kconfig" + config INET + bool "TCP/IP networking" + ---help--- +diff --unified --recursive --new-file linux-2.6.12.5/net/Makefile linux-2.6.12.5-1-686-smp-ring3/net/Makefile +--- linux-2.6.12.5/net/Makefile 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Makefile 2005-10-22 23:50:45.491479000 +0200 +@@ -41,6 +41,7 @@ + obj-$(CONFIG_DECNET) += decnet/ + obj-$(CONFIG_ECONET) += econet/ + obj-$(CONFIG_VLAN_8021Q) += 8021q/ ++obj-$(CONFIG_RING) += ring/ + obj-$(CONFIG_IP_SCTP) += sctp/ + + ifeq ($(CONFIG_NET),y) +diff --unified --recursive --new-file linux-2.6.12.5/net/Makefile.ORG linux-2.6.12.5-1-686-smp-ring3/net/Makefile.ORG +--- linux-2.6.12.5/net/Makefile.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Makefile.ORG 2005-10-22 23:50:45.483478500 +0200 +@@ -0,0 +1,48 @@ ++# ++# Makefile for the linux networking. ++# ++# 2 Sep 2000, Christoph Hellwig ++# Rewritten to use lists instead of if-statements. 
++# ++ ++obj-y := nonet.o ++ ++obj-$(CONFIG_NET) := socket.o core/ ++ ++tmp-$(CONFIG_COMPAT) := compat.o ++obj-$(CONFIG_NET) += $(tmp-y) ++ ++# LLC has to be linked before the files in net/802/ ++obj-$(CONFIG_LLC) += llc/ ++obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ ++obj-$(CONFIG_INET) += ipv4/ ++obj-$(CONFIG_XFRM) += xfrm/ ++obj-$(CONFIG_UNIX) += unix/ ++ifneq ($(CONFIG_IPV6),) ++obj-y += ipv6/ ++endif ++obj-$(CONFIG_PACKET) += packet/ ++obj-$(CONFIG_NET_KEY) += key/ ++obj-$(CONFIG_NET_SCHED) += sched/ ++obj-$(CONFIG_BRIDGE) += bridge/ ++obj-$(CONFIG_IPX) += ipx/ ++obj-$(CONFIG_ATALK) += appletalk/ ++obj-$(CONFIG_WAN_ROUTER) += wanrouter/ ++obj-$(CONFIG_X25) += x25/ ++obj-$(CONFIG_LAPB) += lapb/ ++obj-$(CONFIG_NETROM) += netrom/ ++obj-$(CONFIG_ROSE) += rose/ ++obj-$(CONFIG_AX25) += ax25/ ++obj-$(CONFIG_IRDA) += irda/ ++obj-$(CONFIG_BT) += bluetooth/ ++obj-$(CONFIG_SUNRPC) += sunrpc/ ++obj-$(CONFIG_RXRPC) += rxrpc/ ++obj-$(CONFIG_ATM) += atm/ ++obj-$(CONFIG_DECNET) += decnet/ ++obj-$(CONFIG_ECONET) += econet/ ++obj-$(CONFIG_VLAN_8021Q) += 8021q/ ++obj-$(CONFIG_IP_SCTP) += sctp/ ++ ++ifeq ($(CONFIG_NET),y) ++obj-$(CONFIG_SYSCTL) += sysctl_net.o ++endif +diff --unified --recursive --new-file linux-2.6.12.5/net/core/dev.c linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c +--- linux-2.6.12.5/net/core/dev.c 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c 2005-10-22 23:50:45.479478250 +0200 +@@ -115,6 +115,56 @@ + #endif /* CONFIG_NET_RADIO */ + #include + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ ++/* #define RING_DEBUG */ ++ ++#include ++#include ++ ++static handle_ring_skb ring_handler = NULL; ++ ++handle_ring_skb get_skb_ring_handler() { return(ring_handler); } ++ ++void set_skb_ring_handler(handle_ring_skb the_handler) { ++ ring_handler = the_handler; ++} ++ ++void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb) { ++ if(ring_handler) ++ ring_handler(skb, recv_packet, 
real_skb); ++} ++ ++/* ******************* */ ++ ++static handle_ring_buffer buffer_ring_handler = NULL; ++ ++handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); } ++ ++void set_buffer_ring_handler(handle_ring_buffer the_handler) { ++ buffer_ring_handler = the_handler; ++} ++ ++int do_buffer_ring_handler(struct net_device *dev, char *data, int len) { ++ if(buffer_ring_handler) { ++ buffer_ring_handler(dev, data, len); ++ return(1); ++ } else ++ return(0); ++} ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++EXPORT_SYMBOL(get_skb_ring_handler); ++EXPORT_SYMBOL(set_skb_ring_handler); ++EXPORT_SYMBOL(do_skb_ring_handler); ++ ++EXPORT_SYMBOL(get_buffer_ring_handler); ++EXPORT_SYMBOL(set_buffer_ring_handler); ++EXPORT_SYMBOL(do_buffer_ring_handler); ++#endif ++ ++#endif + /* This define, if set, will randomly drop a packet when congestion + * is more than moderate. It helps fairness in the multi-interface + * case when one of them is a hog, but it kills performance for the +@@ -1293,6 +1343,10 @@ + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + #endif + if (q->enqueue) { ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler) ring_handler(skb, 0, 1); ++#endif /* CONFIG_RING */ ++ + /* Grab device queue */ + spin_lock(&dev->queue_lock); + +@@ -1509,6 +1563,13 @@ + + preempt_disable(); + err = netif_rx(skb); ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } ++#endif /* CONFIG_RING */ ++ + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); +@@ -1655,6 +1716,13 @@ + int ret = NET_RX_DROP; + unsigned short type; + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } ++#endif /* CONFIG_RING */ ++ + /* if we've gotten here through NAPI, 
check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) + return NET_RX_DROP; +diff --unified --recursive --new-file linux-2.6.12.5/net/core/dev.c.ORG linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c.ORG +--- linux-2.6.12.5/net/core/dev.c.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c.ORG 2005-10-22 23:50:45.203461000 +0200 +@@ -0,0 +1,3385 @@ ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. 
++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. ++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_RADIO ++#include /* Note : will define WIRELESS_EXT */ ++#include ++#endif /* CONFIG_NET_RADIO */ ++#include ++ ++/* This define, if set, will randomly drop a packet when congestion ++ * is more than moderate. It helps fairness in the multi-interface ++ * case when one of them is a hog, but it kills performance for the ++ * single interface case so it is off now by default. 
++ */ ++#undef RAND_LIE ++ ++/* Setting this will sample the queue lengths and thus congestion ++ * via a timer instead of as each packet is received. ++ */ ++#undef OFFLINE_SAMPLE ++ ++/* ++ * The list of packet types we will receive (as opposed to discard) ++ * and the routines to invoke. ++ * ++ * Why 16. Because with 16 the only overlap we get on a hash of the ++ * low nibble of the protocol value is RARP/SNAP/X.25. ++ * ++ * NOTE: That is no longer true with the addition of VLAN tags. Not ++ * sure which should go first, but I bet it won't make much ++ * difference if we are running VLANs. The good news is that ++ * this protocol won't be in the list unless compiled in, so ++ * the average user (w/out VLANs) will not be adversly affected. ++ * --BLG ++ * ++ * 0800 IP ++ * 8100 802.1Q VLAN ++ * 0001 802.3 ++ * 0002 AX.25 ++ * 0004 802.2 ++ * 8035 RARP ++ * 0005 SNAP ++ * 0805 X.25 ++ * 0806 ARP ++ * 8137 IPX ++ * 0009 Localtalk ++ * 86DD IPv6 ++ */ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++static struct list_head ptype_base[16]; /* 16 way hashed list */ ++static struct list_head ptype_all; /* Taps */ ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy); ++static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); ++#endif ++ ++/* ++ * The @dev_base list is protected by @dev_base_lock and the rtln ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading. ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. 
++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. ++ */ ++struct net_device *dev_base; ++static struct net_device **dev_tail = &dev_base; ++DEFINE_RWLOCK(dev_base_lock); ++ ++EXPORT_SYMBOL(dev_base); ++EXPORT_SYMBOL(dev_base_lock); ++ ++#define NETDEV_HASHBITS 8 ++static struct hlist_head dev_name_head[1<type == htons(ETH_P_ALL)) { ++ netdev_nit++; ++ list_add_rcu(&pt->list, &ptype_all); ++ } else { ++ hash = ntohs(pt->type) & 15; ++ list_add_rcu(&pt->list, &ptype_base[hash]); ++ } ++ spin_unlock_bh(&ptype_lock); ++} ++ ++extern void linkwatch_run_queue(void); ++ ++ ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head; ++ struct packet_type *pt1; ++ ++ spin_lock_bh(&ptype_lock); ++ ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit--; ++ head = &ptype_all; ++ } else ++ head = &ptype_base[ntohs(pt->type) & 15]; ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); ++out: ++ spin_unlock_bh(&ptype_lock); ++} ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. 
++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. ++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++ ++/****************************************************************************** ++ ++ Device Boot-time Settings Routines ++ ++*******************************************************************************/ ++ ++/* Boot time configuration table */ ++static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; ++ ++/** ++ * netdev_boot_setup_add - add new setup entry ++ * @name: name of the device ++ * @map: configured settings for the device ++ * ++ * Adds new setup entry to the dev_boot_setup list. The function ++ * returns 0 on error and 1 on success. This is a generic routine to ++ * all netdevices. ++ */ ++static int netdev_boot_setup_add(char *name, struct ifmap *map) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { ++ memset(s[i].name, 0, sizeof(s[i].name)); ++ strcpy(s[i].name, name); ++ memcpy(&s[i].map, map, sizeof(s[i].map)); ++ break; ++ } ++ } ++ ++ return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; ++} ++ ++/** ++ * netdev_boot_setup_check - check boot time settings ++ * @dev: the netdevice ++ * ++ * Check boot time settings for the device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found, 1 if they are. 
++ */ ++int netdev_boot_setup_check(struct net_device *dev) ++{ ++ struct netdev_boot_setup *s = dev_boot_setup; ++ int i; ++ ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && ++ !strncmp(dev->name, s[i].name, strlen(s[i].name))) { ++ dev->irq = s[i].map.irq; ++ dev->base_addr = s[i].map.base_addr; ++ dev->mem_start = s[i].map.mem_start; ++ dev->mem_end = s[i].map.mem_end; ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++ ++/** ++ * netdev_boot_base - get address from boot time settings ++ * @prefix: prefix for network device ++ * @unit: id for network device ++ * ++ * Check boot time settings for the base address of device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found. ++ */ ++unsigned long netdev_boot_base(const char *prefix, int unit) ++{ ++ const struct netdev_boot_setup *s = dev_boot_setup; ++ char name[IFNAMSIZ]; ++ int i; ++ ++ sprintf(name, "%s%d", prefix, unit); ++ ++ /* ++ * If device already registered then return base of 1 ++ * to indicate not to probe for this interface ++ */ ++ if (__dev_get_by_name(name)) ++ return 1; ++ ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) ++ if (!strcmp(name, s[i].name)) ++ return s[i].map.base_addr; ++ return 0; ++} ++ ++/* ++ * Saves at boot time configured settings for any netdevice. 
++ */ ++int __init netdev_boot_setup(char *str) ++{ ++ int ints[5]; ++ struct ifmap map; ++ ++ str = get_options(str, ARRAY_SIZE(ints), ints); ++ if (!str || !*str) ++ return 0; ++ ++ /* Save settings */ ++ memset(&map, 0, sizeof(map)); ++ if (ints[0] > 0) ++ map.irq = ints[1]; ++ if (ints[0] > 1) ++ map.base_addr = ints[2]; ++ if (ints[0] > 2) ++ map.mem_start = ints[3]; ++ if (ints[0] > 3) ++ map.mem_end = ints[4]; ++ ++ /* Add new entry to the list */ ++ return netdev_boot_setup_add(str, &map); ++} ++ ++__setup("netdev=", netdev_boot_setup); ++ ++/******************************************************************************* ++ ++ Device Interface Subroutines ++ ++*******************************************************************************/ ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(const char *name) ++{ ++ struct hlist_node *p; ++ ++ hlist_for_each(p, dev_name_hash(name)) { ++ struct net_device *dev ++ = hlist_entry(p, struct net_device, name_hlist); ++ if (!strncmp(dev->name, name, IFNAMSIZ)) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. 
++ */ ++ ++struct net_device *dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(int ifindex) ++{ ++ struct hlist_node *p; ++ ++ hlist_for_each(p, dev_index_hash(ifindex)) { ++ struct net_device *dev ++ = hlist_entry(p, struct net_device, index_hlist); ++ if (dev->ifindex == ifindex) ++ return dev; ++ } ++ return NULL; ++} ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. ++ */ ++ ++struct net_device *dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifindex); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_getbyhwaddr - find a device by its hardware address ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. The caller must hold the ++ * rtnl semaphore. 
The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ * BUGS: ++ * If the API was consistent this would be __dev_get_by_hwaddr ++ */ ++ ++struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ ++ for (dev = dev_base; dev; dev = dev->next) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ break; ++ return dev; ++} ++ ++struct net_device *dev_getfirstbyhwtype(unsigned short type) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ for (dev = dev_base; dev; dev = dev->next) { ++ if (dev->type == type) { ++ dev_hold(dev); ++ break; ++ } ++ } ++ rtnl_unlock(); ++ return dev; ++} ++ ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ dev_hold(dev); ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * to allow sysfs to work ++ */ ++static int dev_valid_name(const char *name) ++{ ++ return !(*name == '\0' ++ || !strcmp(name, ".") ++ || !strcmp(name, "..") ++ || strchr(name, '/')); ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. Not efficient for many devices, not called a lot. The caller ++ * must hold the dev_base or rtnl lock while allocating the name and ++ * adding the device in order to avoid duplicates. Returns the number ++ * of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ int i = 0; ++ char buf[IFNAMSIZ]; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ long *inuse; ++ struct net_device *d; ++ ++ p = strnchr(name, IFNAMSIZ-1, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. 
++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for (d = dev_base; d; d = d->next) { ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!__dev_get_by_name(buf)) { ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return i; ++ } ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. ++ */ ++ return -ENFILE; ++} ++ ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. 
++ */ ++int dev_change_name(struct net_device *dev, char *newname) ++{ ++ int err = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev->flags & IFF_UP) ++ return -EBUSY; ++ ++ if (!dev_valid_name(newname)) ++ return -EINVAL; ++ ++ if (strchr(newname, '%')) { ++ err = dev_alloc_name(dev, newname); ++ if (err < 0) ++ return err; ++ strcpy(newname, dev->name); ++ } ++ else if (__dev_get_by_name(newname)) ++ return -EEXIST; ++ else ++ strlcpy(dev->name, newname, IFNAMSIZ); ++ ++ err = class_device_rename(&dev->class_dev, dev->name); ++ if (!err) { ++ hlist_del(&dev->name_hlist); ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); ++ } ++ ++ return err; ++} ++ ++/** ++ * netdev_features_change - device changes fatures ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. ++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0); ++ } ++} ++ ++/** ++ * dev_load - load a network module ++ * @name: name of interface ++ * ++ * If a network interface is not present and the process has suitable ++ * privileges this function loads the module. If module loading is not ++ * available in this kernel then it becomes a nop. 
++ */ ++ ++void dev_load(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ read_unlock(&dev_base_lock); ++ ++ if (!dev && capable(CAP_SYS_MODULE)) ++ request_module("%s", name); ++} ++ ++static int default_rebuild_header(struct sk_buff *skb) ++{ ++ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", ++ skb->dev ? skb->dev->name : "NULL!!!"); ++ kfree_skb(skb); ++ return 1; ++} ++ ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev) ++{ ++ int ret = 0; ++ ++ /* ++ * Is it already up? ++ */ ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ /* ++ * Is it even present? ++ */ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ /* ++ * Call device private open method ++ */ ++ set_bit(__LINK_STATE_START, &dev->state); ++ if (dev->open) { ++ ret = dev->open(dev); ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ } ++ ++ /* ++ * If it went open OK then: ++ */ ++ ++ if (!ret) { ++ /* ++ * Set the flags. ++ */ ++ dev->flags |= IFF_UP; ++ ++ /* ++ * Initialize multicasting status ++ */ ++ dev_mc_upload(dev); ++ ++ /* ++ * Wakeup transmit queue engine ++ */ ++ dev_activate(dev); ++ ++ /* ++ * ... and announce new interface. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UP, dev); ++ } ++ return ret; ++} ++ ++/** ++ * dev_close - shutdown an interface. ++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. 
The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++int dev_close(struct net_device *dev) ++{ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ /* ++ * Tell people we are going down, so that they can ++ * prepare to death, when device is still operating. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); ++ ++ dev_deactivate(dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, ++ * it can be even on different cpu. So just clear netif_running(), ++ * and wait when poll really will happen. Actually, the best place ++ * for this is inside dev->stop() after device stopped its irq ++ * engine, but this requires more changes in devices. */ ++ ++ smp_mb__after_clear_bit(); /* Commit netif_running(). */ ++ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { ++ /* No hurry. */ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(1); ++ } ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (dev->stop) ++ dev->stop(dev); ++ ++ /* ++ * Device is now down. ++ */ ++ ++ dev->flags &= ~IFF_UP; ++ ++ /* ++ * Tell people we are down ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); ++ ++ return 0; ++} ++ ++ ++/* ++ * Device change register/unregister. These are not inline or static ++ * as we export them to the world. ++ */ ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. 
++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net_device *dev; ++ int err; ++ ++ rtnl_lock(); ++ err = notifier_chain_register(&netdev_chain, nb); ++ if (!err) { ++ for (dev = dev_base; dev; dev = dev->next) { ++ nb->notifier_call(nb, NETDEV_REGISTER, dev); ++ ++ if (dev->flags & IFF_UP) ++ nb->notifier_call(nb, NETDEV_UP, dev); ++ } ++ } ++ rtnl_unlock(); ++ return err; ++} ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_unregister(&netdev_chain, nb); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @v: pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, void *v) ++{ ++ return notifier_call_chain(&netdev_chain, val, v); ++} ++ ++/* When > 0 there are consumers of rx skb time stamps */ ++static atomic_t netstamp_needed = ATOMIC_INIT(0); ++ ++void net_enable_timestamp(void) ++{ ++ atomic_inc(&netstamp_needed); ++} ++ ++void net_disable_timestamp(void) ++{ ++ atomic_dec(&netstamp_needed); ++} ++ ++static inline void net_timestamp(struct timeval *stamp) ++{ ++ if (atomic_read(&netstamp_needed)) ++ do_gettimeofday(stamp); ++ else { ++ stamp->tv_sec = 0; ++ stamp->tv_usec = 0; ++ } ++} ++ ++/* ++ * Support routine. 
Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ net_timestamp(&skb->stamp); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if ((ptype->dev == dev || !ptype->dev) && ++ (ptype->af_packet_priv == NULL || ++ (struct sock *)ptype->af_packet_priv != skb->sk)) { ++ struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ break; ++ ++ /* skb->nh should be correctly ++ set by sender, so that the second statement is ++ just protection against buggy protocols. ++ */ ++ skb2->mac.raw = skb2->data; ++ ++ if (skb2->nh.raw < skb2->data || ++ skb2->nh.raw > skb2->tail) { ++ if (net_ratelimit()) ++ printk(KERN_CRIT "protocol %04x is " ++ "buggy, dev %s\n", ++ skb2->protocol, dev->name); ++ skb2->nh.raw = skb2->data; ++ } ++ ++ skb2->h.raw = skb2->nh.raw; ++ skb2->pkt_type = PACKET_OUTGOING; ++ ptype->func(skb2, skb->dev, ptype); ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. 
++ */ ++int skb_checksum_help(struct sk_buff *skb, int inward) ++{ ++ unsigned int csum; ++ int ret = 0, offset = skb->h.raw - skb->data; ++ ++ if (inward) { ++ skb->ip_summed = CHECKSUM_NONE; ++ goto out; ++ } ++ ++ if (skb_cloned(skb)) { ++ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++ if (ret) ++ goto out; ++ } ++ ++ if (offset > (int)skb->len) ++ BUG(); ++ csum = skb_checksum(skb, offset, skb->len-offset, 0); ++ ++ offset = skb->tail - skb->h.raw; ++ if (offset <= 0) ++ BUG(); ++ if (skb->csum + 2 > offset) ++ BUG(); ++ ++ *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++ ++#ifdef CONFIG_HIGHMEM ++/* Actually, we should eliminate this check as soon as we know, that: ++ * 1. IOMMU is present and allows to map all the memory. ++ * 2. No high memory really exists on this machine. ++ */ ++ ++static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++ int i; ++ ++ if (dev->features & NETIF_F_HIGHDMA) ++ return 0; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ if (PageHighMem(skb_shinfo(skb)->frags[i].page)) ++ return 1; ++ ++ return 0; ++} ++#else ++#define illegal_highdma(dev, skb) (0) ++#endif ++ ++extern void skb_release_data(struct sk_buff *); ++ ++/* Keep head the same: replace data */ ++int __skb_linearize(struct sk_buff *skb, int gfp_mask) ++{ ++ unsigned int size; ++ u8 *data; ++ long offset; ++ struct skb_shared_info *ninfo; ++ int headerlen = skb->data - skb->head; ++ int expand = (skb->tail + skb->data_len) - skb->end; ++ ++ if (skb_shared(skb)) ++ BUG(); ++ ++ if (expand <= 0) ++ expand = 0; ++ ++ size = skb->end - skb->head + expand; ++ size = SKB_DATA_ALIGN(size); ++ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); ++ if (!data) ++ return -ENOMEM; ++ ++ /* Copy entire thing */ ++ if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) ++ BUG(); ++ ++ /* Set up shinfo */ ++ ninfo = (struct skb_shared_info*)(data + size); ++ 
atomic_set(&ninfo->dataref, 1); ++ ninfo->tso_size = skb_shinfo(skb)->tso_size; ++ ninfo->tso_segs = skb_shinfo(skb)->tso_segs; ++ ninfo->nr_frags = 0; ++ ninfo->frag_list = NULL; ++ ++ /* Offset between the two in bytes */ ++ offset = data - skb->head; ++ ++ /* Free old data. */ ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->end = data + size; ++ ++ /* Set up new pointers */ ++ skb->h.raw += offset; ++ skb->nh.raw += offset; ++ skb->mac.raw += offset; ++ skb->tail += offset; ++ skb->data += offset; ++ ++ /* We are no longer a clone, even if we were. */ ++ skb->cloned = 0; ++ ++ skb->tail += skb->data_len; ++ skb->data_len = 0; ++ return 0; ++} ++ ++#define HARD_TX_LOCK(dev, cpu) { \ ++ if ((dev->features & NETIF_F_LLTX) == 0) { \ ++ spin_lock(&dev->xmit_lock); \ ++ dev->xmit_lock_owner = cpu; \ ++ } \ ++} ++ ++#define HARD_TX_UNLOCK(dev) { \ ++ if ((dev->features & NETIF_F_LLTX) == 0) { \ ++ dev->xmit_lock_owner = -1; \ ++ spin_unlock(&dev->xmit_lock); \ ++ } \ ++} ++ ++/** ++ * dev_queue_xmit - transmit a buffer ++ * @skb: buffer to transmit ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * A negative errno code is returned on a failure. A success does not ++ * guarantee the frame will be transmitted as it may be dropped due ++ * to congestion or traffic shaping. ++ * ++ * ----------------------------------------------------------------------------------- ++ * I notice this method can also return errors from the queue disciplines, ++ * including NET_XMIT_DROP, which is a positive value. So, errors can also ++ * be positive. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) 
++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * --BLG ++ */ ++ ++int dev_queue_xmit(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ ++ if (skb_shinfo(skb)->frag_list && ++ !(dev->features & NETIF_F_FRAGLIST) && ++ __skb_linearize(skb, GFP_ATOMIC)) ++ goto out_kfree_skb; ++ ++ /* Fragmented skb is linearized if device does not support SG, ++ * or if at least one of fragments is in highmem and device ++ * does not support DMA from it. ++ */ ++ if (skb_shinfo(skb)->nr_frags && ++ (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && ++ __skb_linearize(skb, GFP_ATOMIC)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not support ++ * checksumming for this protocol, complete checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_HW && ++ (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && ++ (!(dev->features & NETIF_F_IP_CSUM) || ++ skb->protocol != htons(ETH_P_IP)))) ++ if (skb_checksum_help(skb, 0)) ++ goto out_kfree_skb; ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ local_bh_disable(); ++ ++ /* Updates of qdisc are serialized by queue_lock. ++ * The struct Qdisc which is pointed to by qdisc is now a ++ * rcu structure - it may be accessed without acquiring ++ * a lock (but the structure may be stale.) The freeing of the ++ * qdisc will be deferred until it's known that there are no ++ * more references to it. ++ * ++ * If the qdisc has an enqueue function, we still need to ++ * hold the queue_lock before calling it, since queue_lock ++ * also serializes access to the device queue. 
++ */ ++ ++ q = rcu_dereference(dev->qdisc); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); ++#endif ++ if (q->enqueue) { ++ /* Grab device queue */ ++ spin_lock(&dev->queue_lock); ++ ++ rc = q->enqueue(skb, q); ++ ++ qdisc_run(dev); ++ ++ spin_unlock(&dev->queue_lock); ++ rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ loopback, all the sorts of tunnels... ++ ++ Really, it is unlikely that xmit_lock protection is necessary here. ++ (f.e. loopback and IP tunnels are clean ignoring statistics ++ counters.) ++ However, it is possible, that they rely on protection ++ made by us here. ++ ++ Check this and shot the lock. It is not prone from deadlocks. ++ Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ if (dev->xmit_lock_owner != cpu) { ++ ++ HARD_TX_LOCK(dev, cpu); ++ ++ if (!netif_queue_stopped(dev)) { ++ if (netdev_nit) ++ dev_queue_xmit_nit(skb, dev); ++ ++ rc = 0; ++ if (!dev->hard_start_xmit(skb, dev)) { ++ HARD_TX_UNLOCK(dev); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev); ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Virtual device %s asks to " ++ "queue packet!\n", dev->name); ++ } else { ++ /* Recursion is detected! 
It is possible, ++ * unfortunately */ ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Dead loop on virtual device " ++ "%s, fix it urgently!\n", dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ local_bh_enable(); ++ ++out_kfree_skb: ++ kfree_skb(skb); ++ return rc; ++out: ++ local_bh_enable(); ++ return rc; ++} ++ ++ ++/*======================================================================= ++ Receiver routines ++ =======================================================================*/ ++ ++int netdev_max_backlog = 300; ++int weight_p = 64; /* old backlog weight */ ++/* These numbers are selected based on intuition and some ++ * experimentatiom, if you have more scientific way of doing this ++ * please go ahead and fix things. ++ */ ++int no_cong_thresh = 10; ++int no_cong = 20; ++int lo_cong = 100; ++int mod_cong = 290; ++ ++DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; ++ ++ ++static void get_sample_stats(int cpu) ++{ ++#ifdef RAND_LIE ++ unsigned long rd; ++ int rq; ++#endif ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ int blog = sd->input_pkt_queue.qlen; ++ int avg_blog = sd->avg_blog; ++ ++ avg_blog = (avg_blog >> 1) + (blog >> 1); ++ ++ if (avg_blog > mod_cong) { ++ /* Above moderate congestion levels. 
*/ ++ sd->cng_level = NET_RX_CN_HIGH; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ sd->cng_level = NET_RX_DROP; ++#endif ++ } else if (avg_blog > lo_cong) { ++ sd->cng_level = NET_RX_CN_MOD; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ sd->cng_level = NET_RX_CN_HIGH; ++#endif ++ } else if (avg_blog > no_cong) ++ sd->cng_level = NET_RX_CN_LOW; ++ else /* no congestion */ ++ sd->cng_level = NET_RX_SUCCESS; ++ ++ sd->avg_blog = avg_blog; ++} ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy) ++{ ++/* 10 ms 0r 1ms -- i don't care -- JHS */ ++ int next_tick = 1; ++ int cpu = smp_processor_id(); ++ ++ get_sample_stats(cpu); ++ next_tick += jiffies; ++ mod_timer(&samp_timer, next_tick); ++} ++#endif ++ ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process. It always succeeds. The buffer ++ * may be dropped during processing for congestion control or by the ++ * protocol layers. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_CN_LOW (low congestion) ++ * NET_RX_CN_MOD (moderate congestion) ++ * NET_RX_CN_HIGH (high congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++ ++int netif_rx(struct sk_buff *skb) ++{ ++ int this_cpu; ++ struct softnet_data *queue; ++ unsigned long flags; ++ ++ /* if netpoll wants it, pretend we never saw it */ ++ if (netpoll_rx(skb)) ++ return NET_RX_DROP; ++ ++ if (!skb->stamp.tv_sec) ++ net_timestamp(&skb->stamp); ++ ++ /* ++ * The code is rearranged so that the path is the most ++ * short when CPU is congested, but is still operating. 
++ */ ++ local_irq_save(flags); ++ this_cpu = smp_processor_id(); ++ queue = &__get_cpu_var(softnet_data); ++ ++ __get_cpu_var(netdev_rx_stat).total++; ++ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { ++ if (queue->input_pkt_queue.qlen) { ++ if (queue->throttle) ++ goto drop; ++ ++enqueue: ++ dev_hold(skb->dev); ++ __skb_queue_tail(&queue->input_pkt_queue, skb); ++#ifndef OFFLINE_SAMPLE ++ get_sample_stats(this_cpu); ++#endif ++ local_irq_restore(flags); ++ return queue->cng_level; ++ } ++ ++ if (queue->throttle) ++ queue->throttle = 0; ++ ++ netif_rx_schedule(&queue->backlog_dev); ++ goto enqueue; ++ } ++ ++ if (!queue->throttle) { ++ queue->throttle = 1; ++ __get_cpu_var(netdev_rx_stat).throttled++; ++ } ++ ++drop: ++ __get_cpu_var(netdev_rx_stat).dropped++; ++ local_irq_restore(flags); ++ ++ kfree_skb(skb); ++ return NET_RX_DROP; ++} ++ ++int netif_rx_ni(struct sk_buff *skb) ++{ ++ int err; ++ ++ preempt_disable(); ++ err = netif_rx(skb); ++ if (local_softirq_pending()) ++ do_softirq(); ++ preempt_enable(); ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(netif_rx_ni); ++ ++static __inline__ void skb_bond(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ ++ if (dev->master) { ++ skb->real_dev = skb->dev; ++ skb->dev = dev->master; ++ } ++} ++ ++static void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = &__get_cpu_var(softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ clist = clist->next; ++ ++ BUG_TRAP(!atomic_read(&skb->users)); ++ __kfree_skb(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct net_device *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ local_irq_enable(); ++ ++ while (head) { ++ struct net_device *dev = head; ++ head = head->next_sched; ++ ++ 
smp_mb__before_clear_bit(); ++ clear_bit(__LINK_STATE_SCHED, &dev->state); ++ ++ if (spin_trylock(&dev->queue_lock)) { ++ qdisc_run(dev); ++ spin_unlock(&dev->queue_lock); ++ } else { ++ netif_schedule(dev); ++ } ++ } ++ } ++} ++ ++static __inline__ int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev) ++{ ++ atomic_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev); ++} ++ ++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) ++int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); ++struct net_bridge; ++struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, ++ unsigned char *addr); ++void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); ++ ++static __inline__ int handle_bridge(struct sk_buff **pskb, ++ struct packet_type **pt_prev, int *ret) ++{ ++ struct net_bridge_port *port; ++ ++ if ((*pskb)->pkt_type == PACKET_LOOPBACK || ++ (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) ++ return 0; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(*pskb, *pt_prev); ++ *pt_prev = NULL; ++ } ++ ++ return br_handle_frame_hook(port, pskb); ++} ++#else ++#define handle_bridge(skb, pt_prev, ret) (0) ++#endif ++ ++#ifdef CONFIG_NET_CLS_ACT ++/* TODO: Maybe we should just force sch_ingress to be compiled in ++ * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions ++ * a compare and 2 stores extra right now if we dont have it on ++ * but have CONFIG_NET_CLS_ACT ++ * NOTE: This doesnt stop any functionality; if you dont have ++ * the ingress scheduler, you just cant add policies on ingress. 
++ * ++ */ ++static int ing_filter(struct sk_buff *skb) ++{ ++ struct Qdisc *q; ++ struct net_device *dev = skb->dev; ++ int result = TC_ACT_OK; ++ ++ if (dev->qdisc_ingress) { ++ __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); ++ if (MAX_RED_LOOP < ttl++) { ++ printk("Redir loop detected Dropping packet (%s->%s)\n", ++ skb->input_dev?skb->input_dev->name:"??",skb->dev->name); ++ return TC_ACT_SHOT; ++ } ++ ++ skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); ++ ++ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); ++ if (NULL == skb->input_dev) { ++ skb->input_dev = skb->dev; ++ printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); ++ } ++ spin_lock(&dev->ingress_lock); ++ if ((q = dev->qdisc_ingress) != NULL) ++ result = q->enqueue(skb, q); ++ spin_unlock(&dev->ingress_lock); ++ ++ } ++ ++ return result; ++} ++#endif ++ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ struct packet_type *ptype, *pt_prev; ++ int ret = NET_RX_DROP; ++ unsigned short type; ++ ++ /* if we've gotten here through NAPI, check netpoll */ ++ if (skb->dev->poll && netpoll_rx(skb)) ++ return NET_RX_DROP; ++ ++ if (!skb->stamp.tv_sec) ++ net_timestamp(&skb->stamp); ++ ++ skb_bond(skb); ++ ++ __get_cpu_var(netdev_rx_stat).total++; ++ ++ skb->h.raw = skb->nh.raw = skb->data; ++ skb->mac_len = skb->nh.raw - skb->mac.raw; ++ ++ pt_prev = NULL; ++ ++ rcu_read_lock(); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (skb->tc_verd & TC_NCLS) { ++ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); ++ goto ncls; ++ } ++#endif ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (!ptype->dev || ptype->dev == skb->dev) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = ptype; ++ } ++ } ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = NULL; /* noone else should process this after*/ ++ } else { ++ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); ++ } ++ ++ ret = ing_filter(skb); ++ ++ if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { 
++ kfree_skb(skb); ++ goto out; ++ } ++ ++ skb->tc_verd = 0; ++ncls: ++#endif ++ ++ handle_diverter(skb); ++ ++ if (handle_bridge(&skb, &pt_prev, &ret)) ++ goto out; ++ ++ type = skb->protocol; ++ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { ++ if (ptype->type == type && ++ (!ptype->dev || ptype->dev == skb->dev)) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = ptype; ++ } ++ } ++ ++ if (pt_prev) { ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } else { ++ kfree_skb(skb); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static int process_backlog(struct net_device *backlog_dev, int *budget) ++{ ++ int work = 0; ++ int quota = min(backlog_dev->quota, *budget); ++ struct softnet_data *queue = &__get_cpu_var(softnet_data); ++ unsigned long start_time = jiffies; ++ ++ backlog_dev->weight = weight_p; ++ for (;;) { ++ struct sk_buff *skb; ++ struct net_device *dev; ++ ++ local_irq_disable(); ++ skb = __skb_dequeue(&queue->input_pkt_queue); ++ if (!skb) ++ goto job_done; ++ local_irq_enable(); ++ ++ dev = skb->dev; ++ ++ netif_receive_skb(skb); ++ ++ dev_put(dev); ++ ++ work++; ++ ++ if (work >= quota || jiffies - start_time > 1) ++ break; ++ ++ } ++ ++ backlog_dev->quota -= work; ++ *budget -= work; ++ return -1; ++ ++job_done: ++ backlog_dev->quota -= work; ++ *budget -= work; ++ ++ list_del(&backlog_dev->poll_list); ++ smp_mb__before_clear_bit(); ++ netif_poll_enable(backlog_dev); ++ ++ if (queue->throttle) ++ queue->throttle = 0; ++ local_irq_enable(); ++ return 0; ++} ++ ++static void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *queue = &__get_cpu_var(softnet_data); ++ unsigned long start_time = jiffies; ++ int budget = netdev_max_backlog; ++ ++ ++ local_irq_disable(); ++ ++ while (!list_empty(&queue->poll_list)) { ++ struct net_device *dev; ++ ++ if (budget <= 
0 || jiffies - start_time > 1) ++ goto softnet_break; ++ ++ local_irq_enable(); ++ ++ dev = list_entry(queue->poll_list.next, ++ struct net_device, poll_list); ++ netpoll_poll_lock(dev); ++ ++ if (dev->quota <= 0 || dev->poll(dev, &budget)) { ++ netpoll_poll_unlock(dev); ++ local_irq_disable(); ++ list_del(&dev->poll_list); ++ list_add_tail(&dev->poll_list, &queue->poll_list); ++ if (dev->quota < 0) ++ dev->quota += dev->weight; ++ else ++ dev->quota = dev->weight; ++ } else { ++ netpoll_poll_unlock(dev); ++ dev_put(dev); ++ local_irq_disable(); ++ } ++ } ++out: ++ local_irq_enable(); ++ return; ++ ++softnet_break: ++ __get_cpu_var(netdev_rx_stat).time_squeeze++; ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ goto out; ++} ++ ++static gifconf_func_t * gifconf_list [NPROTO]; ++ ++/** ++ * register_gifconf - register a SIOCGIF handler ++ * @family: Address family ++ * @gifconf: Function handler ++ * ++ * Register protocol dependent address dumping routines. The handler ++ * that is passed must not be freed or reused until it has been replaced ++ * by another handler. ++ */ ++int register_gifconf(unsigned int family, gifconf_func_t * gifconf) ++{ ++ if (family >= NPROTO) ++ return -EINVAL; ++ gifconf_list[family] = gifconf; ++ return 0; ++} ++ ++ ++/* ++ * Map an interface index to its name (SIOCGIFNAME) ++ */ ++ ++/* ++ * We need this ioctl for efficient implementation of the ++ * if_indextoname() function required by the IPv6 API. Without ++ * it, we would have to search all the interfaces to find a ++ * match. --pb ++ */ ++ ++static int dev_ifname(struct ifreq __user *arg) ++{ ++ struct net_device *dev; ++ struct ifreq ifr; ++ ++ /* ++ * Fetch the caller's info block. 
++ */ ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifr.ifr_ifindex); ++ if (!dev) { ++ read_unlock(&dev_base_lock); ++ return -ENODEV; ++ } ++ ++ strcpy(ifr.ifr_name, dev->name); ++ read_unlock(&dev_base_lock); ++ ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return 0; ++} ++ ++/* ++ * Perform a SIOCGIFCONF call. This structure will change ++ * size eventually, and there is nothing I can do about it. ++ * Thus we will need a 'compatibility mode'. ++ */ ++ ++static int dev_ifconf(char __user *arg) ++{ ++ struct ifconf ifc; ++ struct net_device *dev; ++ char __user *pos; ++ int len; ++ int total; ++ int i; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) ++ return -EFAULT; ++ ++ pos = ifc.ifc_buf; ++ len = ifc.ifc_len; ++ ++ /* ++ * Loop over the interfaces, and write an info block for each. ++ */ ++ ++ total = 0; ++ for (dev = dev_base; dev; dev = dev->next) { ++ for (i = 0; i < NPROTO; i++) { ++ if (gifconf_list[i]) { ++ int done; ++ if (!pos) ++ done = gifconf_list[i](dev, NULL, 0); ++ else ++ done = gifconf_list[i](dev, pos + total, ++ len - total); ++ if (done < 0) ++ return -EFAULT; ++ total += done; ++ } ++ } ++ } ++ ++ /* ++ * All done. Write the updated control block back to the caller. ++ */ ++ ifc.ifc_len = total; ++ ++ /* ++ * Both BSD and Solaris return 0 here, so we do too. ++ */ ++ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++/* ++ * This is invoked by the /proc filesystem handler to display a device ++ * in detail. ++ */ ++static __inline__ struct net_device *dev_get_idx(loff_t pos) ++{ ++ struct net_device *dev; ++ loff_t i; ++ ++ for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); ++ ++ return i == pos ? 
dev : NULL; ++} ++ ++void *dev_seq_start(struct seq_file *seq, loff_t *pos) ++{ ++ read_lock(&dev_base_lock); ++ return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; ++} ++ ++void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; ++} ++ ++void dev_seq_stop(struct seq_file *seq, void *v) ++{ ++ read_unlock(&dev_base_lock); ++} ++ ++static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) ++{ ++ if (dev->get_stats) { ++ struct net_device_stats *stats = dev->get_stats(dev); ++ ++ seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " ++ "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", ++ dev->name, stats->rx_bytes, stats->rx_packets, ++ stats->rx_errors, ++ stats->rx_dropped + stats->rx_missed_errors, ++ stats->rx_fifo_errors, ++ stats->rx_length_errors + stats->rx_over_errors + ++ stats->rx_crc_errors + stats->rx_frame_errors, ++ stats->rx_compressed, stats->multicast, ++ stats->tx_bytes, stats->tx_packets, ++ stats->tx_errors, stats->tx_dropped, ++ stats->tx_fifo_errors, stats->collisions, ++ stats->tx_carrier_errors + ++ stats->tx_aborted_errors + ++ stats->tx_window_errors + ++ stats->tx_heartbeat_errors, ++ stats->tx_compressed); ++ } else ++ seq_printf(seq, "%6s: No statistics available.\n", dev->name); ++} ++ ++/* ++ * Called from the PROCfs module. 
This now uses the new arbitrary sized ++ * /proc/net interface to create /proc/net/dev ++ */ ++static int dev_seq_show(struct seq_file *seq, void *v) ++{ ++ if (v == SEQ_START_TOKEN) ++ seq_puts(seq, "Inter-| Receive " ++ " | Transmit\n" ++ " face |bytes packets errs drop fifo frame " ++ "compressed multicast|bytes packets errs " ++ "drop fifo colls carrier compressed\n"); ++ else ++ dev_seq_printf_stats(seq, v); ++ return 0; ++} ++ ++static struct netif_rx_stats *softnet_get_online(loff_t *pos) ++{ ++ struct netif_rx_stats *rc = NULL; ++ ++ while (*pos < NR_CPUS) ++ if (cpu_online(*pos)) { ++ rc = &per_cpu(netdev_rx_stat, *pos); ++ break; ++ } else ++ ++*pos; ++ return rc; ++} ++ ++static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) ++{ ++ return softnet_get_online(pos); ++} ++ ++static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return softnet_get_online(pos); ++} ++ ++static void softnet_seq_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static int softnet_seq_show(struct seq_file *seq, void *v) ++{ ++ struct netif_rx_stats *s = v; ++ ++ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", ++ s->total, s->dropped, s->time_squeeze, s->throttled, ++ s->fastroute_hit, s->fastroute_success, s->fastroute_defer, ++ s->fastroute_deferred_out, ++#if 0 ++ s->fastroute_latency_reduction ++#else ++ s->cpu_collision ++#endif ++ ); ++ return 0; ++} ++ ++static struct seq_operations dev_seq_ops = { ++ .start = dev_seq_start, ++ .next = dev_seq_next, ++ .stop = dev_seq_stop, ++ .show = dev_seq_show, ++}; ++ ++static int dev_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &dev_seq_ops); ++} ++ ++static struct file_operations dev_seq_fops = { ++ .owner = THIS_MODULE, ++ .open = dev_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static struct seq_operations softnet_seq_ops = { ++ .start = softnet_seq_start, ++ .next = softnet_seq_next, 
++ .stop = softnet_seq_stop, ++ .show = softnet_seq_show, ++}; ++ ++static int softnet_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &softnet_seq_ops); ++} ++ ++static struct file_operations softnet_seq_fops = { ++ .owner = THIS_MODULE, ++ .open = softnet_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++#ifdef WIRELESS_EXT ++extern int wireless_proc_init(void); ++#else ++#define wireless_proc_init() 0 ++#endif ++ ++static int __init dev_proc_init(void) ++{ ++ int rc = -ENOMEM; ++ ++ if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) ++ goto out; ++ if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) ++ goto out_dev; ++ if (wireless_proc_init()) ++ goto out_softnet; ++ rc = 0; ++out: ++ return rc; ++out_softnet: ++ proc_net_remove("softnet_stat"); ++out_dev: ++ proc_net_remove("dev"); ++ goto out; ++} ++#else ++#define dev_proc_init() 0 ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/** ++ * netdev_set_master - set up master/slave pair ++ * @slave: slave device ++ * @master: new master device ++ * ++ * Changes the master device of the slave. Pass %NULL to break the ++ * bonding. The caller must hold the RTNL semaphore. On a failure ++ * a negative errno code is returned. On success the reference counts ++ * are adjusted, %RTM_NEWLINK is sent to the routing socket and the ++ * function returns zero. 
++ */ ++int netdev_set_master(struct net_device *slave, struct net_device *master) ++{ ++ struct net_device *old = slave->master; ++ ++ ASSERT_RTNL(); ++ ++ if (master) { ++ if (old) ++ return -EBUSY; ++ dev_hold(master); ++ } ++ ++ slave->master = master; ++ ++ synchronize_net(); ++ ++ if (old) ++ dev_put(old); ++ ++ if (master) ++ slave->flags |= IFF_SLAVE; ++ else ++ slave->flags &= ~IFF_SLAVE; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promsicuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ */ ++void dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_PROMISC; ++ if ((dev->promiscuity += inc) == 0) ++ dev->flags &= ~IFF_PROMISC; ++ if (dev->flags ^ old_flags) { ++ dev_mc_upload(dev); ++ printk(KERN_INFO "device %s %s promiscuous mode\n", ++ dev->name, (dev->flags & IFF_PROMISC) ? "entered" : ++ "left"); ++ } ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. 
++ */ ++ ++void dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_ALLMULTI; ++ if ((dev->allmulti += inc) == 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ if (dev->flags ^ old_flags) ++ dev_mc_upload(dev); ++} ++ ++unsigned dev_get_flags(const struct net_device *dev) ++{ ++ unsigned flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev) && netif_carrier_ok(dev)) ++ flags |= IFF_RUNNING; ++ ++ return flags; ++} ++ ++int dev_change_flags(struct net_device *dev, unsigned flags) ++{ ++ int ret; ++ int old_flags = dev->flags; ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ dev_mc_upload(dev); ++ ++ /* ++ * Have we downed the interface. We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ ++ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); ++ ++ if (!ret) ++ dev_mc_upload(dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | ++ IFF_VOLATILE))) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? +1 : -1; ++ dev->gflags ^= IFF_PROMISC; ++ dev_set_promiscuity(dev, inc); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ is important. Some (broken) drivers set IFF_PROMISC, when ++ IFF_ALLMULTI is requested not asking us and not reporting. 
++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? +1 : -1; ++ dev->gflags ^= IFF_ALLMULTI; ++ dev_set_allmulti(dev, inc); ++ } ++ ++ if (old_flags ^ dev->flags) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); ++ ++ return ret; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ int err; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ /* MTU must be positive. */ ++ if (new_mtu < 0) ++ return -EINVAL; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = 0; ++ if (dev->change_mtu) ++ err = dev->change_mtu(dev, new_mtu); ++ else ++ dev->mtu = new_mtu; ++ if (!err && dev->flags & IFF_UP) ++ notifier_call_chain(&netdev_chain, ++ NETDEV_CHANGEMTU, dev); ++ return err; ++} ++ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) ++{ ++ int err; ++ ++ if (!dev->set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev->set_mac_address(dev, sa); ++ if (!err) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return err; ++} ++ ++/* ++ * Perform the SIOCxIFxxx calls. 
++ */ ++static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ++{ ++ int err; ++ struct net_device *dev = __dev_get_by_name(ifr->ifr_name); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ switch (cmd) { ++ case SIOCGIFFLAGS: /* Get interface flags */ ++ ifr->ifr_flags = dev_get_flags(dev); ++ return 0; ++ ++ case SIOCSIFFLAGS: /* Set interface flags */ ++ return dev_change_flags(dev, ifr->ifr_flags); ++ ++ case SIOCGIFMETRIC: /* Get the metric on the interface ++ (currently unused) */ ++ ifr->ifr_metric = 0; ++ return 0; ++ ++ case SIOCSIFMETRIC: /* Set the metric on the interface ++ (currently unused) */ ++ return -EOPNOTSUPP; ++ ++ case SIOCGIFMTU: /* Get the MTU of a device */ ++ ifr->ifr_mtu = dev->mtu; ++ return 0; ++ ++ case SIOCSIFMTU: /* Set the MTU of a device */ ++ return dev_set_mtu(dev, ifr->ifr_mtu); ++ ++ case SIOCGIFHWADDR: ++ if (!dev->addr_len) ++ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); ++ else ++ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, ++ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); ++ ifr->ifr_hwaddr.sa_family = dev->type; ++ return 0; ++ ++ case SIOCSIFHWADDR: ++ return dev_set_mac_address(dev, &ifr->ifr_hwaddr); ++ ++ case SIOCSIFHWBROADCAST: ++ if (ifr->ifr_hwaddr.sa_family != dev->type) ++ return -EINVAL; ++ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, ++ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); ++ notifier_call_chain(&netdev_chain, ++ NETDEV_CHANGEADDR, dev); ++ return 0; ++ ++ case SIOCGIFMAP: ++ ifr->ifr_map.mem_start = dev->mem_start; ++ ifr->ifr_map.mem_end = dev->mem_end; ++ ifr->ifr_map.base_addr = dev->base_addr; ++ ifr->ifr_map.irq = dev->irq; ++ ifr->ifr_map.dma = dev->dma; ++ ifr->ifr_map.port = dev->if_port; ++ return 0; ++ ++ case SIOCSIFMAP: ++ if (dev->set_config) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->set_config(dev, &ifr->ifr_map); ++ } ++ return -EOPNOTSUPP; ++ ++ case SIOCADDMULTI: ++ if (!dev->set_multicast_list || ++ 
ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, ++ dev->addr_len, 1); ++ ++ case SIOCDELMULTI: ++ if (!dev->set_multicast_list || ++ ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, ++ dev->addr_len, 1); ++ ++ case SIOCGIFINDEX: ++ ifr->ifr_ifindex = dev->ifindex; ++ return 0; ++ ++ case SIOCGIFTXQLEN: ++ ifr->ifr_qlen = dev->tx_queue_len; ++ return 0; ++ ++ case SIOCSIFTXQLEN: ++ if (ifr->ifr_qlen < 0) ++ return -EINVAL; ++ dev->tx_queue_len = ifr->ifr_qlen; ++ return 0; ++ ++ case SIOCSIFNAME: ++ ifr->ifr_newname[IFNAMSIZ-1] = '\0'; ++ return dev_change_name(dev, ifr->ifr_newname); ++ ++ /* ++ * Unknown or private ioctl ++ */ ++ ++ default: ++ if ((cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15) || ++ cmd == SIOCBONDENSLAVE || ++ cmd == SIOCBONDRELEASE || ++ cmd == SIOCBONDSETHWADDR || ++ cmd == SIOCBONDSLAVEINFOQUERY || ++ cmd == SIOCBONDINFOQUERY || ++ cmd == SIOCBONDCHANGEACTIVE || ++ cmd == SIOCGMIIPHY || ++ cmd == SIOCGMIIREG || ++ cmd == SIOCSMIIREG || ++ cmd == SIOCBRADDIF || ++ cmd == SIOCBRDELIF || ++ cmd == SIOCWANDEV) { ++ err = -EOPNOTSUPP; ++ if (dev->do_ioctl) { ++ if (netif_device_present(dev)) ++ err = dev->do_ioctl(dev, ifr, ++ cmd); ++ else ++ err = -ENODEV; ++ } ++ } else ++ err = -EINVAL; ++ ++ } ++ return err; ++} ++ ++/* ++ * This function handles all "interface"-type I/O control requests. The actual ++ * 'doing' part of this is dev_ifsioc above. ++ */ ++ ++/** ++ * dev_ioctl - network device ioctl ++ * @cmd: command to issue ++ * @arg: pointer to a struct ifreq in user space ++ * ++ * Issue ioctl functions to devices. This is normally called by the ++ * user space syscall interfaces but can sometimes be useful for ++ * other purposes. 
The return value is the return from the syscall if ++ * positive or a negative errno code on error. ++ */ ++ ++int dev_ioctl(unsigned int cmd, void __user *arg) ++{ ++ struct ifreq ifr; ++ int ret; ++ char *colon; ++ ++ /* One special case: SIOCGIFCONF takes ifconf argument ++ and requires shared lock, because it sleeps writing ++ to user space. ++ */ ++ ++ if (cmd == SIOCGIFCONF) { ++ rtnl_shlock(); ++ ret = dev_ifconf((char __user *) arg); ++ rtnl_shunlock(); ++ return ret; ++ } ++ if (cmd == SIOCGIFNAME) ++ return dev_ifname((struct ifreq __user *)arg); ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ ifr.ifr_name[IFNAMSIZ-1] = 0; ++ ++ colon = strchr(ifr.ifr_name, ':'); ++ if (colon) ++ *colon = 0; ++ ++ /* ++ * See which interface the caller is talking about. ++ */ ++ ++ switch (cmd) { ++ /* ++ * These ioctl calls: ++ * - can be done by all. ++ * - atomic and do not require locking. ++ * - return a value ++ */ ++ case SIOCGIFFLAGS: ++ case SIOCGIFMETRIC: ++ case SIOCGIFMTU: ++ case SIOCGIFHWADDR: ++ case SIOCGIFSLAVE: ++ case SIOCGIFMAP: ++ case SIOCGIFINDEX: ++ case SIOCGIFTXQLEN: ++ dev_load(ifr.ifr_name); ++ read_lock(&dev_base_lock); ++ ret = dev_ifsioc(&ifr, cmd); ++ read_unlock(&dev_base_lock); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ case SIOCETHTOOL: ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ethtool(&ifr); ++ rtnl_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. 
++ * - return a value ++ */ ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSIFNAME: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. ++ * - do not return a value ++ */ ++ case SIOCSIFFLAGS: ++ case SIOCSIFMETRIC: ++ case SIOCSIFMTU: ++ case SIOCSIFMAP: ++ case SIOCSIFHWADDR: ++ case SIOCSIFSLAVE: ++ case SIOCADDMULTI: ++ case SIOCDELMULTI: ++ case SIOCSIFHWBROADCAST: ++ case SIOCSIFTXQLEN: ++ case SIOCSMIIREG: ++ case SIOCBONDENSLAVE: ++ case SIOCBONDRELEASE: ++ case SIOCBONDSETHWADDR: ++ case SIOCBONDSLAVEINFOQUERY: ++ case SIOCBONDINFOQUERY: ++ case SIOCBONDCHANGEACTIVE: ++ case SIOCBRADDIF: ++ case SIOCBRDELIF: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ return ret; ++ ++ case SIOCGIFMEM: ++ /* Get the per device memory space. We can add this but ++ * currently do not support it */ ++ case SIOCSIFMEM: ++ /* Set the per device memory buffer space. ++ * Not applicable in our case */ ++ case SIOCSIFLINK: ++ return -EINVAL; ++ ++ /* ++ * Unknown or private ioctl. 
++ */ ++ default: ++ if (cmd == SIOCWANDEV || ++ (cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15)) { ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ if (!ret && copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ return ret; ++ } ++#ifdef WIRELESS_EXT ++ /* Take care of Wireless Extensions */ ++ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { ++ /* If command is `set a parameter', or ++ * `get the encoding parameters', check if ++ * the user has the right to do it */ ++ if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ } ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ /* Follow me in net/core/wireless.c */ ++ ret = wireless_process_ioctl(&ifr, cmd); ++ rtnl_unlock(); ++ if (IW_IS_GET(cmd) && ++ copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ return ret; ++ } ++#endif /* WIRELESS_EXT */ ++ return -EINVAL; ++ } ++} ++ ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(void) ++{ ++ static int ifindex; ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(ifindex)) ++ return ifindex; ++ } ++} ++ ++static int dev_boot_phase = 1; ++ ++/* Delayed registration/unregisteration */ ++static DEFINE_SPINLOCK(net_todo_list_lock); ++static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); ++ ++static inline void net_set_todo(struct net_device *dev) ++{ ++ spin_lock(&net_todo_list_lock); ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ spin_unlock(&net_todo_list_lock); ++} ++ ++/** ++ * register_netdevice - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. 
A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * register_netdev() instead of this. ++ * ++ * BUGS: ++ * The locking appears insufficient to guarantee two parallel registers ++ * will not get the same name. ++ */ ++ ++int register_netdevice(struct net_device *dev) ++{ ++ struct hlist_head *head; ++ struct hlist_node *p; ++ int ret; ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ /* When net_device's are persistent, this will be fatal. */ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ ++ spin_lock_init(&dev->queue_lock); ++ spin_lock_init(&dev->xmit_lock); ++ dev->xmit_lock_owner = -1; ++#ifdef CONFIG_NET_CLS_ACT ++ spin_lock_init(&dev->ingress_lock); ++#endif ++ ++ ret = alloc_divert_blk(dev); ++ if (ret) ++ goto out; ++ ++ dev->iflink = -1; ++ ++ /* Init, if this function is available */ ++ if (dev->init) { ++ ret = dev->init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto out_err; ++ } ++ } ++ ++ if (!dev_valid_name(dev->name)) { ++ ret = -EINVAL; ++ goto out_err; ++ } ++ ++ dev->ifindex = dev_new_index(); ++ if (dev->iflink == -1) ++ dev->iflink = dev->ifindex; ++ ++ /* Check for existence of name */ ++ head = dev_name_hash(dev->name); ++ hlist_for_each(p, head) { ++ struct net_device *d ++ = hlist_entry(p, struct net_device, name_hlist); ++ if (!strncmp(d->name, dev->name, IFNAMSIZ)) { ++ ret = -EEXIST; ++ goto out_err; ++ } ++ } ++ ++ /* Fix illegal SG+CSUM combinations. */ ++ if ((dev->features & NETIF_F_SG) && ++ !(dev->features & (NETIF_F_IP_CSUM | ++ NETIF_F_NO_CSUM | ++ NETIF_F_HW_CSUM))) { ++ printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", ++ dev->name); ++ dev->features &= ~NETIF_F_SG; ++ } ++ ++ /* TSO requires that SG is present as well. 
*/ ++ if ((dev->features & NETIF_F_TSO) && ++ !(dev->features & NETIF_F_SG)) { ++ printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", ++ dev->name); ++ dev->features &= ~NETIF_F_TSO; ++ } ++ ++ /* ++ * nil rebuild_header routine, ++ * that should be never called and used as just bug trap. ++ */ ++ ++ if (!dev->rebuild_header) ++ dev->rebuild_header = default_rebuild_header; ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ dev->next = NULL; ++ dev_init_scheduler(dev); ++ write_lock_bh(&dev_base_lock); ++ *dev_tail = dev; ++ dev_tail = &dev->next; ++ hlist_add_head(&dev->name_hlist, head); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); ++ dev_hold(dev); ++ dev->reg_state = NETREG_REGISTERING; ++ write_unlock_bh(&dev_base_lock); ++ ++ /* Notify protocols, that a new device appeared. */ ++ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); ++ ++ /* Finish registration after unlock */ ++ net_set_todo(dev); ++ ret = 0; ++ ++out: ++ return ret; ++out_err: ++ free_divert_blk(dev); ++ goto out; ++} ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdev that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. ++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ rtnl_lock(); ++ ++ /* ++ * If the name is a format string the caller wants us to do a ++ * name allocation. 
++ */ ++ if (strchr(dev->name, '%')) { ++ err = dev_alloc_name(dev, dev->name); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* ++ * Back compatibility hook. Kill this one in 2.5 ++ */ ++ if (dev->name[0] == 0 || dev->name[0] == ' ') { ++ err = dev_alloc_name(dev, "eth%d"); ++ if (err < 0) ++ goto out; ++ } ++ ++ err = register_netdevice(dev); ++out: ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++/* ++ * netdev_wait_allrefs - wait until all references are gone. ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static void netdev_wait_allrefs(struct net_device *dev) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ ++ rebroadcast_time = warning_time = jiffies; ++ while (atomic_read(&dev->refcnt) != 0) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_shlock(); ++ ++ /* Rebroadcast unregister notification */ ++ notifier_call_chain(&netdev_chain, ++ NETDEV_UNREGISTER, dev); ++ ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ } ++ ++ rtnl_shunlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ msleep(250); ++ ++ if (time_after(jiffies, warning_time + 10 * HZ)) { ++ printk(KERN_EMERG "unregister_netdevice: " ++ "waiting for %s to become free. Usage " ++ "count = %d\n", ++ dev->name, atomic_read(&dev->refcnt)); ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... 
++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock() after it drops the semaphore. ++ * This allows us to deal with problems: ++ * 1) We can create/delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ */ ++static DECLARE_MUTEX(net_todo_run_mutex); ++void netdev_run_todo(void) ++{ ++ struct list_head list = LIST_HEAD_INIT(list); ++ int err; ++ ++ ++ /* Need to guard against multiple cpu's getting out of order. */ ++ down(&net_todo_run_mutex); ++ ++ /* Not safe to do outside the semaphore. We must not return ++ * until all unregister events invoked by the local processor ++ * have been completed (either by this todo run, or one on ++ * another cpu). ++ */ ++ if (list_empty(&net_todo_list)) ++ goto out; ++ ++ /* Snapshot list, allow later requests */ ++ spin_lock(&net_todo_list_lock); ++ list_splice_init(&net_todo_list, &list); ++ spin_unlock(&net_todo_list_lock); ++ ++ while (!list_empty(&list)) { ++ struct net_device *dev ++ = list_entry(list.next, struct net_device, todo_list); ++ list_del(&dev->todo_list); ++ ++ switch(dev->reg_state) { ++ case NETREG_REGISTERING: ++ err = netdev_register_sysfs(dev); ++ if (err) ++ printk(KERN_ERR "%s: failed sysfs registration (%d)\n", ++ dev->name, err); ++ dev->reg_state = NETREG_REGISTERED; ++ break; ++ ++ case NETREG_UNREGISTERING: ++ netdev_unregister_sysfs(dev); ++ dev->reg_state = NETREG_UNREGISTERED; ++ ++ netdev_wait_allrefs(dev); ++ ++ /* paranoia */ ++ BUG_ON(atomic_read(&dev->refcnt)); ++ BUG_TRAP(!dev->ip_ptr); ++ BUG_TRAP(!dev->ip6_ptr); ++ BUG_TRAP(!dev->dn_ptr); ++ ++ ++ /* It must be the very last action, ++ * after this 'dev' may point to freed up memory. 
++ */ ++ if (dev->destructor) ++ dev->destructor(dev); ++ break; ++ ++ default: ++ printk(KERN_ERR "network todo '%s' but state %d\n", ++ dev->name, dev->reg_state); ++ break; ++ } ++ } ++ ++out: ++ up(&net_todo_run_mutex); ++} ++ ++/** ++ * alloc_netdev - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @setup: callback to initialize device ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. ++ */ ++struct net_device *alloc_netdev(int sizeof_priv, const char *name, ++ void (*setup)(struct net_device *)) ++{ ++ void *p; ++ struct net_device *dev; ++ int alloc_size; ++ ++ /* ensure 32-byte alignment of both the device and private area */ ++ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; ++ alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; ++ ++ p = kmalloc(alloc_size, GFP_KERNEL); ++ if (!p) { ++ printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); ++ return NULL; ++ } ++ memset(p, 0, alloc_size); ++ ++ dev = (struct net_device *) ++ (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); ++ dev->padded = (char *)dev - (char *)p; ++ ++ if (sizeof_priv) ++ dev->priv = netdev_priv(dev); ++ ++ setup(dev); ++ strcpy(dev->name, name); ++ return dev; ++} ++EXPORT_SYMBOL(alloc_netdev); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. ++ * If this is the last reference then it will be freed. 
++ */ ++void free_netdev(struct net_device *dev) ++{ ++#ifdef CONFIG_SYSFS ++ /* Compatiablity with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ kfree((char *)dev - dev->padded); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via class release */ ++ class_device_put(&dev->class_dev); ++#else ++ kfree((char *)dev - dev->padded); ++#endif ++} ++ ++/* Synchronize with packet receive processing. */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ synchronize_rcu(); ++} ++ ++/** ++ * unregister_netdevice - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. On success 0 is returned, on a failure ++ * a negative errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. ++ */ ++ ++int unregister_netdevice(struct net_device *dev) ++{ ++ struct net_device *d, **dp; ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ /* Some devices call without registering for initialization unwind. */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " ++ "was registered\n", dev->name, dev); ++ return -ENODEV; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ ++ /* If device is running, close it first. */ ++ if (dev->flags & IFF_UP) ++ dev_close(dev); ++ ++ /* And unlink it from device chain. 
*/ ++ for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { ++ if (d == dev) { ++ write_lock_bh(&dev_base_lock); ++ hlist_del(&dev->name_hlist); ++ hlist_del(&dev->index_hlist); ++ if (dev_tail == &dev->next) ++ dev_tail = dp; ++ *dp = d->next; ++ write_unlock_bh(&dev_base_lock); ++ break; ++ } ++ } ++ if (!d) { ++ printk(KERN_ERR "unregister net_device: '%s' not found\n", ++ dev->name); ++ return -ENODEV; ++ } ++ ++ dev->reg_state = NETREG_UNREGISTERING; ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ ++ /* Notify protocols, that we are about to destroy ++ this device. They should clean all the things. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); ++ ++ /* ++ * Flush the multicast chain ++ */ ++ dev_mc_discard(dev); ++ ++ if (dev->uninit) ++ dev->uninit(dev); ++ ++ /* Notifier chain MUST detach us from master device. */ ++ BUG_TRAP(!dev->master); ++ ++ free_divert_blk(dev); ++ ++ /* Finish processing unregister after unlock */ ++ net_set_todo(dev); ++ ++ synchronize_net(); ++ ++ dev_put(dev); ++ return 0; ++} ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. On success 0 is returned, on a failure ++ * a negative errno code is returned. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. 
++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++ ++EXPORT_SYMBOL(unregister_netdev); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static int dev_cpu_callback(struct notifier_block *nfb, ++ unsigned long action, ++ void *ocpu) ++{ ++ struct sk_buff **list_skb; ++ struct net_device **list_net; ++ struct sk_buff *skb; ++ unsigned int cpu, oldcpu = (unsigned long)ocpu; ++ struct softnet_data *sd, *oldsd; ++ ++ if (action != CPU_DEAD) ++ return NOTIFY_OK; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Find end of our output_queue. */ ++ list_net = &sd->output_queue; ++ while (*list_net) ++ list_net = &(*list_net)->next_sched; ++ /* Append output queue from offline CPU. */ ++ *list_net = oldsd->output_queue; ++ oldsd->output_queue = NULL; ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) ++ netif_rx(skb); ++ ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. 
++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ net_random_init(); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_sysfs_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < 16; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) ++ INIT_HLIST_HEAD(&dev_name_head[i]); ++ ++ for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) ++ INIT_HLIST_HEAD(&dev_index_head[i]); ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ struct softnet_data *queue; ++ ++ queue = &per_cpu(softnet_data, i); ++ skb_queue_head_init(&queue->input_pkt_queue); ++ queue->throttle = 0; ++ queue->cng_level = 0; ++ queue->avg_blog = 10; /* arbitrary non-zero */ ++ queue->completion_queue = NULL; ++ INIT_LIST_HEAD(&queue->poll_list); ++ set_bit(__LINK_STATE_START, &queue->backlog_dev.state); ++ queue->backlog_dev.weight = weight_p; ++ queue->backlog_dev.poll = process_backlog; ++ atomic_set(&queue->backlog_dev.refcnt, 1); ++ } ++ ++#ifdef OFFLINE_SAMPLE ++ samp_timer.expires = jiffies + (10 * HZ); ++ add_timer(&samp_timer); ++#endif ++ ++ dev_boot_phase = 0; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); ++ ++ hotcpu_notifier(dev_cpu_callback, 0); ++ dst_init(); ++ dev_mcast_init(); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); ++ ++EXPORT_SYMBOL(__dev_get_by_index); ++EXPORT_SYMBOL(__dev_get_by_name); ++EXPORT_SYMBOL(__dev_remove_pack); ++EXPORT_SYMBOL(__skb_linearize); ++EXPORT_SYMBOL(dev_add_pack); ++EXPORT_SYMBOL(dev_alloc_name); ++EXPORT_SYMBOL(dev_close); ++EXPORT_SYMBOL(dev_get_by_flags); ++EXPORT_SYMBOL(dev_get_by_index); ++EXPORT_SYMBOL(dev_get_by_name); ++EXPORT_SYMBOL(dev_ioctl); ++EXPORT_SYMBOL(dev_open); ++EXPORT_SYMBOL(dev_queue_xmit); ++EXPORT_SYMBOL(dev_remove_pack); ++EXPORT_SYMBOL(dev_set_allmulti); 
++EXPORT_SYMBOL(dev_set_promiscuity); ++EXPORT_SYMBOL(dev_change_flags); ++EXPORT_SYMBOL(dev_set_mtu); ++EXPORT_SYMBOL(dev_set_mac_address); ++EXPORT_SYMBOL(free_netdev); ++EXPORT_SYMBOL(netdev_boot_setup_check); ++EXPORT_SYMBOL(netdev_set_master); ++EXPORT_SYMBOL(netdev_state_change); ++EXPORT_SYMBOL(netif_receive_skb); ++EXPORT_SYMBOL(netif_rx); ++EXPORT_SYMBOL(register_gifconf); ++EXPORT_SYMBOL(register_netdevice); ++EXPORT_SYMBOL(register_netdevice_notifier); ++EXPORT_SYMBOL(skb_checksum_help); ++EXPORT_SYMBOL(synchronize_net); ++EXPORT_SYMBOL(unregister_netdevice); ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++EXPORT_SYMBOL(net_enable_timestamp); ++EXPORT_SYMBOL(net_disable_timestamp); ++EXPORT_SYMBOL(dev_get_flags); ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++EXPORT_SYMBOL(br_handle_frame_hook); ++EXPORT_SYMBOL(br_fdb_get_hook); ++EXPORT_SYMBOL(br_fdb_put_hook); ++#endif ++ ++#ifdef CONFIG_KMOD ++EXPORT_SYMBOL(dev_load); ++#endif ++ ++EXPORT_PER_CPU_SYMBOL(softnet_data); +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/Kconfig linux-2.6.12.5-1-686-smp-ring3/net/ring/Kconfig +--- linux-2.6.12.5/net/ring/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/Kconfig 2005-10-22 23:50:45.539482000 +0200 +@@ -0,0 +1,14 @@ ++config RING ++ tristate "PF_RING sockets (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ ---help--- ++ PF_RING socket family, optimized for packet capture. ++ If a PF_RING socket is bound to an adapter (via the bind() system ++ call), such adapter will be used in read-only mode until the socket ++ is destroyed. Whenever an incoming packet is received from the adapter ++ it will not be passed to upper layers, but instead it is copied to a ring ++ buffer, which in turn is exported to user space applications via mmap. ++ Please refer to http://luca.ntop.org/Ring.pdf for more. ++ ++ Say N unless you know what you are doing.
++ +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/Makefile linux-2.6.12.5-1-686-smp-ring3/net/ring/Makefile +--- linux-2.6.12.5/net/ring/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/Makefile 2005-10-22 23:50:45.051451500 +0200 +@@ -0,0 +1,7 @@ ++# ++# Makefile for the ring driver. ++# ++ ++obj-m += ring.o ++ ++ring-objs := ring_packet.o +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/ring_packet.c linux-2.6.12.5-1-686-smp-ring3/net/ring/ring_packet.c +--- linux-2.6.12.5/net/ring/ring_packet.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/ring_packet.c 2005-10-22 23:50:45.159458250 +0200 +@@ -0,0 +1,1592 @@ ++/* ++ * ++ * (C) 2004-05 - Luca Deri ++ * ++ * This code includes patches courtesy of ++ * - Jeff Randall ++ * - Helmut Manck ++ * - Brad Doctor ++ * ++ */ ++ ++/* FIX: add an entry inside the /proc filesystem */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#include ++#else ++#include ++#endif ++#include ++#include /* needed for virt_to_phys() */ ++ ++/* #define RING_DEBUG */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)) ++static inline int remap_page_range(struct vm_area_struct *vma, ++ unsigned long uvaddr, ++ unsigned long paddr, ++ unsigned long size, ++ pgprot_t prot) { ++ return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT, ++ size, prot)); ++} ++#endif ++ ++/* ************************************************* */ ++ ++#define CLUSTER_LEN 8 ++ ++struct ring_cluster { ++ u_short cluster_id; /* 0 = no cluster */ ++ u_short num_cluster_elements; ++ enum cluster_type hashing_mode; ++ u_short hashing_id; ++ struct sock *sk[CLUSTER_LEN]; ++ struct ring_cluster *next; /* NULL = last element of the cluster */ ++}; ++ ++/* 
************************************************* */
++
++/*
++ * One node of ring_table. ring_insert() allocates an element for a socket
++ * and links it in through 'list'; ring_remove() finds it again by 'sk'.
++ */
++struct ring_element {
++  struct list_head  list; /* linkage into ring_table */
++  struct sock      *sk;   /* socket this entry stands for */
++};
++
++/* ************************************************* */
++
++/*
++ * State of a single ring. This is the structure add_skb_to_ring() receives
++ * and the type ring_sk_datatype() casts to on 2.6 kernels.
++ */
++struct ring_opt {
++  /* presumably the device this ring captures from - it is assigned
++     outside the code visible here, TODO confirm */
++  struct net_device *ring_netdev;
++
++  /* Cluster */
++  u_short cluster_id; /* 0 = no cluster */
++
++  /* Reflector: when set (and its queue is not stopped) add_skb_to_ring()
++     hands the packet to this device's hard_start_xmit and returns
++     without storing anything in the ring */
++  struct net_device *reflector_dev;
++
++  /* Packet buffers */
++  /* NOTE(review): looks like the page-allocation order used for
++     ring_memory - confirm where the ring is allocated */
++  unsigned long order;
++
++  /* Ring Slots */
++  unsigned long ring_memory;
++  FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
++  char *ring_slots;  /* Basically it points to ring_memory
++                        +sizeof(FlowSlotInfo) */
++
++  /* Packet Sampling: only applied when sample_rate > 1. pktToSample is a
++     countdown; a packet is kept when it is 0 (and the counter is reloaded
++     with sample_rate), otherwise it is decremented and the packet dropped */
++  u_int pktToSample, sample_rate;
++
++  /* BPF Filter: when non-NULL it is run via sk_run_filter() on every
++     packet and a result of 0 discards the packet */
++  struct sk_filter *bpfFilter;
++
++  /* Locks */
++  atomic_t num_ring_slots_waiters;
++  /* woken by add_skb_to_ring() after each processed packet, for poll() */
++  wait_queue_head_t ring_slots_waitqueue;
++  /* write-locked by add_skb_to_ring() around updates of the slots_info
++     counters/insert index and of pktToSample */
++  rwlock_t ring_index_lock;
++
++  /* Indexes (Internal) */
++  /* in the code visible here these are only reported by RING_DEBUG
++     output */
++  u_int insert_page_id, insert_slot_id;
++};
++
++/* ************************************************* */
++
++/* List of all ring sockets.
*/
++static struct list_head ring_table;
++
++/* List of all clusters */
++static struct ring_cluster *ring_cluster_list;
++
++/* Taken for writing (irqs off) by ring_insert()/ring_remove() while they
++   modify ring_table */
++static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED;
++
++/* ********************************** */
++
++/* Forward */
++static struct proto_ops ring_ops;
++
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
++static struct proto ring_proto;
++#endif
++
++static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
++			    u_char real_skb);
++static int buffer_ring_handler(struct net_device *dev, char *data, int len);
++static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
++
++/* Extern */
++
++/* ********************************** */
++
++/* Defaults */
++/* Module parameters (all integers). enable_tx_capture == 0 makes
++   skb_ring_handler() ignore packets that were not received. */
++static u_int bucket_len = 128, num_slots = 4096, sample_rate = 1,
++  transparent_mode = 0, enable_tx_capture = 0;
++
++MODULE_PARM(bucket_len, "i");
++MODULE_PARM_DESC(bucket_len, "Number of ring buckets");
++MODULE_PARM(num_slots,  "i");
++MODULE_PARM_DESC(num_slots,  "Number of ring slots");
++MODULE_PARM(sample_rate, "i");
++MODULE_PARM_DESC(sample_rate, "Ring packet sample rate");
++MODULE_PARM(transparent_mode, "i");
++MODULE_PARM_DESC(transparent_mode,
++		 "Set to 1 to set transparent mode "
++		 "(slower but backwards compatible)");
++MODULE_PARM(enable_tx_capture, "i");
++MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
++
++/* ********************************** */
++
++#define MIN_QUEUED_PKTS      64
++#define MAX_QUEUE_LOOPS      64
++
++
++/* ring_sk() names the per-socket private pointer: sk_protinfo on 2.6,
++   protinfo.pf_ring on older kernels. ring_sk_datatype() casts to
++   struct ring_opt * on 2.6 and is the identity otherwise. */
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
++#define ring_sk(__sk) ((__sk)->sk_protinfo)
++#else
++#define ring_sk_datatype(a) (a)
++#define ring_sk(__sk) ((__sk)->protinfo.pf_ring)
++#endif
++
++/*
++  int dev_queue_xmit(struct sk_buff *skb)
++  skb->dev;
++  struct net_device *dev_get_by_name(const char *name)
++*/
++
++/* ********************************** */
++
++/*
++ * ring_sock_destruct()
++ *
++ * Frees the private data reachable through ring_sk(sk). If the socket is
++ * not flagged dead the function returns early and frees nothing.
++ * On 2.6 the receive queue is purged first; on older kernels the module
++ * use count is dropped after the free.
++ */
++static void ring_sock_destruct(struct sock *sk) {
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++  skb_queue_purge(&sk->sk_receive_queue);
++
++  if (!sock_flag(sk, SOCK_DEAD)) {
++#if defined(RING_DEBUG)
++    printk("Attempt to release alive ring socket: %p\n", sk);
++#endif
++    return;
++  }
++
++  /* no receive/send memory may still be accounted to the socket */
++  BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
++  BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
++#else
++
++  BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
++  BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
++
++  if (!sk->dead) {
++#if defined(RING_DEBUG)
++    printk("Attempt to release alive ring socket: %p\n", sk);
++#endif
++    return;
++  }
++#endif
++
++  kfree(ring_sk(sk));
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
++  MOD_DEC_USE_COUNT;
++#endif
++}
++
++/* ********************************** */
++/*
++ * ring_insert()
++ *
++ * store the sk in a new element and add it
++ * to the head of the list.
++ *
++ * The element is allocated with GFP_ATOMIC. If the allocation fails the
++ * socket is not added; only a rate-limited message is printed and the
++ * caller gets no error indication.
++ */
++static inline void ring_insert(struct sock *sk) {
++  struct ring_element *next;
++
++#if defined(RING_DEBUG)
++  printk("RING: ring_insert()\n");
++#endif
++
++  next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
++  if(next != NULL) {
++    next->sk = sk;
++    /* ring_table is only modified with ring_mgmt_lock write-held */
++    write_lock_irq(&ring_mgmt_lock);
++    list_add(&next->list, &ring_table);
++    write_unlock_irq(&ring_mgmt_lock);
++  } else {
++    if (net_ratelimit())
++      printk("RING: could not kmalloc slot!!\n");
++  }
++}
++
++/* ********************************** */
++/*
++ * ring_remove()
++ *
++ * For each of the elements in the list:
++ *  - check if this is the element we want to delete
++ *  - if it is, remove it from the list, and free it.
++ *
++ * stop when we find the one we're looking for (break),
++ * or when we reach the end of the list.
++ */ ++static inline void ring_remove(struct sock *sk) { ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ if(entry->sk == sk) { ++ write_lock_irq(&ring_mgmt_lock); ++ list_del(ptr); ++ kfree(ptr); ++ write_unlock_irq(&ring_mgmt_lock); ++ break; ++ } ++ } ++ ++} ++ ++/* ********************************** */ ++ ++static u_int32_t num_queued_pkts(struct ring_opt *pfr) { ++ ++ if(pfr->ring_slots != NULL) { ++ ++ u_int32_t tot_insert = pfr->slots_info->insert_idx, ++#if defined(RING_DEBUG) ++ tot_read = pfr->slots_info->tot_read, tot_pkts; ++#else ++ tot_read = pfr->slots_info->tot_read; ++#endif ++ ++ if(tot_insert >= tot_read) { ++#if defined(RING_DEBUG) ++ tot_pkts = tot_insert-tot_read; ++#endif ++ return(tot_insert-tot_read); ++ } else { ++#if defined(RING_DEBUG) ++ tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read; ++#endif ++ return(((u_int32_t)-1)+tot_insert-tot_read); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n", ++ tot_pkts, tot_insert, tot_read); ++#endif ++ ++ } else ++ return(0); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) { ++ FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx ++ *pfr->slots_info->slot_len]); ++ return(slot); ++ } else ++ return(NULL); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) ++ return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx* ++ pfr->slots_info->slot_len])); ++ else ++ return(NULL); ++} ++ ++/* 
********************************** */ ++ ++static void add_skb_to_ring(struct sk_buff *skb, ++ struct ring_opt *pfr, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ FlowSlot *theSlot; ++ int idx, displ; ++ ++ if(recv_packet) { ++ /* Hack for identifying a packet received by the e1000 */ ++ if(real_skb) { ++ displ = SKB_DISPLACEMENT; ++ } else ++ displ = 0; /* Received by the e1000 wrapper */ ++ } else ++ displ = 0; ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_pkts++; ++ write_unlock(&pfr->ring_index_lock); ++ ++ /* BPF Filtering (from af_packet.c) */ ++ if(pfr->bpfFilter != NULL) { ++ unsigned res = 1, len; ++ ++ len = skb->len-skb->data_len; ++ ++ write_lock(&pfr->ring_index_lock); ++ skb->data -= displ; ++ res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len); ++ skb->data += displ; ++ write_unlock(&pfr->ring_index_lock); ++ ++ if(res == 0) { ++ /* Filter failed */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]" ++ "[insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ ++ return; ++ } ++ } ++ ++ /* ************************** */ ++ ++ if(pfr->sample_rate > 1) { ++ if(pfr->pktToSample == 0) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample = pfr->sample_rate; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample--; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): sampled packet [len=%d]" ++ "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ return; ++ } ++ } ++ ++ /* ************************************* */ ++ ++ if((pfr->reflector_dev != NULL) ++ && (!netif_queue_stopped(pfr->reflector_dev))) { ++ int cpu = 
smp_processor_id(); ++ ++ /* increase reference counter so that this skb is not freed */ ++ atomic_inc(&skb->users); ++ ++ skb->data -= displ; ++ ++ /* send it */ ++ if (pfr->reflector_dev->xmit_lock_owner != cpu) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = cpu; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ ++ if (pfr->reflector_dev->hard_start_xmit(skb, ++ pfr->reflector_dev) == 0) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ skb->data += displ; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit succeeded\n"); ++#endif ++ return; /* OK */ ++ } ++ ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit failed\n"); ++#endif ++ skb->data += displ; ++ return; /* -ENETDOWN */ ++ } ++ ++ /* ************************************* */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]" ++ "[pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ ++ idx = pfr->slots_info->insert_idx; ++ theSlot = get_insert_slot(pfr); ++ ++ if((theSlot != NULL) && (theSlot->slot_state == 0)) { ++ struct pcap_pkthdr *hdr; ++ unsigned int bucketSpace; ++ char *bucket; ++ ++ /* Update Index */ ++ idx++; ++ ++ if(idx == pfr->slots_info->tot_slots) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = 0; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = idx; ++ write_unlock(&pfr->ring_index_lock); ++ } ++ ++ bucketSpace = pfr->slots_info->slot_len ++#ifdef RING_MAGIC ++ - sizeof(u_char) ++#endif ++ - sizeof(u_char) /* flowSlot.slot_state */ ++ - sizeof(struct 
pcap_pkthdr) ++ - 1 /* 10 */ /* safe boundary */; ++ ++ bucket = &theSlot->bucket; ++ hdr = (struct pcap_pkthdr*)bucket; ++ ++ if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp); ++ ++ hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec; ++ hdr->caplen = skb->len+displ; ++ ++ if(hdr->caplen > bucketSpace) ++ hdr->caplen = bucketSpace; ++ ++ hdr->len = skb->len+displ; ++ memcpy(&bucket[sizeof(struct pcap_pkthdr)], ++ skb->data-displ, hdr->caplen); ++ ++#if defined(RING_DEBUG) ++ { ++ static unsigned int lastLoss = 0; ++ ++ if(pfr->slots_info->tot_lost ++ && (lastLoss != pfr->slots_info->tot_lost)) { ++ printk("add_skb_to_ring(%d): [bucketSpace=%d]" ++ "[hdr.caplen=%d][skb->len=%d]" ++ "[pcap_pkthdr=%d][removeIdx=%d]" ++ "[loss=%lu][page=%u][slot=%u]\n", ++ idx-1, bucketSpace, hdr->caplen, skb->len, ++ sizeof(struct pcap_pkthdr), ++ pfr->slots_info->remove_idx, ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->insert_page_id, pfr->insert_slot_id); ++ ++ lastLoss = pfr->slots_info->tot_lost; ++ } ++ } ++#endif ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_insert++; ++ theSlot->slot_state = 1; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_lost++; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): packet lost [loss=%lu]" ++ "[removeIdx=%u][insertIdx=%u]\n", ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->slots_info->remove_idx, pfr->slots_info->insert_idx); ++#endif ++ } ++ ++ /* wakeup in case of poll() */ ++ if(waitqueue_active(&pfr->ring_slots_waitqueue)) ++ wake_up_interruptible(&pfr->ring_slots_waitqueue); ++} ++ ++/* ********************************** */ ++ ++static u_int hash_skb(struct ring_cluster *cluster_ptr, ++ struct sk_buff *skb, u_char recv_packet) { ++ u_int idx; ++ int displ; ++ struct iphdr *ip; ++ ++ if(cluster_ptr->hashing_mode == cluster_round_robin) { ++ idx = 
cluster_ptr->hashing_id++; ++ } else { ++ /* Per-flow clustering */ ++ if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) { ++ if(recv_packet) ++ displ = 0; ++ else ++ displ = SKB_DISPLACEMENT; ++ ++ /* ++ skb->data+displ ++ ++ Always points to to the IP part of the packet ++ */ ++ ++ ip = (struct iphdr*)(skb->data+displ); ++ ++ idx = ip->saddr+ip->daddr+ip->protocol; ++ ++ if(ip->protocol == IPPROTO_TCP) { ++ struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += tcp->source+tcp->dest; ++ } else if(ip->protocol == IPPROTO_UDP) { ++ struct udphdr *udp = (struct udphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += udp->source+udp->dest; ++ } ++ } else ++ idx = skb->len; ++ } ++ ++ return(idx % cluster_ptr->num_cluster_elements); ++} ++ ++/* ********************************** */ ++ ++static int skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ struct sock *skElement; ++ int rc = 0; ++ struct list_head *ptr; ++ struct ring_cluster *cluster_ptr; ++ ++ if((!skb) /* Invalid skb */ ++ || ((!enable_tx_capture) && (!recv_packet))) { ++ /* ++ An outgoing packet is about to be sent out ++ but we decided not to handle transmitted ++ packets. ++ */ ++ return(0); ++ } ++ ++#if defined(RING_DEBUG) ++ if(0) { ++ printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len, ++ skb->dev->name == NULL ? 
"" : skb->dev->name); ++ } ++#endif ++ ++ /* [1] Check unclustered sockets */ ++ for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ struct ring_opt *pfr; ++ struct ring_element *entry; ++ ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = entry->sk; ++ pfr = ring_sk(skElement); ++ read_unlock(&ring_mgmt_lock); ++ ++ if((pfr != NULL) ++ && (pfr->cluster_id == 0 /* No cluster */) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ ++ /* [2] Check socket clusters */ ++ cluster_ptr = ring_cluster_list; ++ ++ while(cluster_ptr != NULL) { ++ struct ring_opt *pfr; ++ ++ if(cluster_ptr->num_cluster_elements > 0) { ++ u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = cluster_ptr->sk[skb_hash]; ++ read_unlock(&ring_mgmt_lock); ++ ++ if(skElement != NULL) { ++ pfr = ring_sk(skElement); ++ ++ if((pfr != NULL) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ } ++ ++ cluster_ptr = cluster_ptr->next; ++ } ++ ++ if(transparent_mode) rc = 0; ++ ++ if((rc != 0) && real_skb) ++ dev_kfree_skb(skb); /* Free the skb */ ++ ++ return(rc); /* 0 = packet not handled */ ++} ++ ++/* ********************************** */ ++ ++struct sk_buff skb; ++ ++static int buffer_ring_handler(struct net_device *dev, ++ char *data, int len) { ++ ++#if defined(RING_DEBUG) ++ printk("buffer_ring_handler: [dev=%s][len=%d]\n", ++ dev->name == NULL ? 
"" : dev->name, len); ++#endif ++ ++ skb.dev = dev, skb.len = len, skb.data = data, ++ skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */ ++ ++ skb_ring_handler(&skb, 1, 0 /* fake skb */); ++ ++ return(0); ++} ++ ++/* ********************************** */ ++ ++static int ring_create(struct socket *sock, int protocol) { ++ struct sock *sk; ++ struct ring_opt *pfr; ++ int err; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create()\n"); ++#endif ++ ++ /* Are you root, superuser or so ? */ ++ if(!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if(sock->type != SOCK_RAW) ++ return -ESOCKTNOSUPPORT; ++ ++ if(protocol != htons(ETH_P_ALL)) ++ return -EPROTONOSUPPORT; ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_INC_USE_COUNT; ++#endif ++ ++ err = -ENOMEM; ++ ++ // BD: -- broke this out to keep it more simple and clear as to what the ++ // options are. ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL); ++#endif ++#endif ++ ++ // BD: API changed in 2.6.12, ref: ++ // http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1); ++#endif ++ ++ if (sk == NULL) ++ goto out; ++ ++ sock->ops = &ring_ops; ++ sock_init_data(sock, sk); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk_set_owner(sk, THIS_MODULE); ++#endif ++#endif ++ ++ err = -ENOMEM; ++ ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL)); ++ ++ if (!(pfr = ring_sk(sk))) { ++ sk_free(sk); ++ goto out; ++ } ++ memset(pfr, 0, sizeof(*pfr)); ++ init_waitqueue_head(&pfr->ring_slots_waitqueue); ++ pfr->ring_index_lock = RW_LOCK_UNLOCKED; ++ atomic_set(&pfr->num_ring_slots_waiters, 0); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ sk->sk_family = PF_RING; ++ sk->sk_destruct = ring_sock_destruct; ++#else ++ 
sk->family = PF_RING; ++ sk->destruct = ring_sock_destruct; ++ sk->num = protocol; ++#endif ++ ++ ring_insert(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create() - created\n"); ++#endif ++ ++ return(0); ++ out: ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_DEC_USE_COUNT; ++#endif ++ return err; ++} ++ ++/* *********************************************** */ ++ ++static int ring_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ ++ if(!sk) ++ return 0; ++ ++#if defined(RING_DEBUG) ++ printk("RING: called ring_release\n"); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release entered\n"); ++#endif ++ ++ ring_remove(sk); ++ ++ sock_orphan(sk); ++ sock->sk = NULL; ++ ++ /* Free the ring buffer */ ++ if(pfr->ring_memory) { ++ struct page *page, *page_end; ++ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ ClearPageReserved(page); ++ ++ free_pages(pfr->ring_memory, pfr->order); ++ } ++ ++ kfree(pfr); ++ ring_sk(sk) = NULL; ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ skb_queue_purge(&sk->sk_write_queue); ++#endif ++ sock_put(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release leaving\n"); ++#endif ++ ++ return 0; ++} ++ ++/* ********************************** */ ++/* ++ * We create a ring for this socket and bind it to the specified device ++ */ ++static int packet_ring_bind(struct sock *sk, struct net_device *dev) ++{ ++ u_int the_slot_len; ++ u_int32_t tot_mem; ++ struct ring_opt *pfr = ring_sk(sk); ++ struct page *page, *page_end; ++ ++ if(!dev) return(-1); ++ ++#if defined(RING_DEBUG) ++ printk("RING: packet_ring_bind(%s) called\n", dev->name); ++#endif ++ ++ /* ********************************************** ++ ++ ************************************* ++ * * ++ * FlowSlotInfo * ++ * * ++ ************************************* <-+ ++ * FlowSlot * | 
++ ************************************* | ++ * FlowSlot * | ++ ************************************* +- num_slots ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* <-+ ++ ++ ********************************************** */ ++ ++ the_slot_len = sizeof(u_char) /* flowSlot.slot_state */ ++ + sizeof(u_short) /* flowSlot.slot_len */ ++ + bucket_len /* flowSlot.bucket */; ++ ++ tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len; ++ ++ /* ++ Calculate the value of the order parameter used later. ++ See http://www.linuxjournal.com/article.php?sid=1133 ++ */ ++ for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ; ++ ++ /* ++ We now try to allocate the memory as required. If we fail ++ we try to allocate a smaller amount or memory (hence a ++ smaller ring). ++ */ ++ while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0) ++ if(pfr->order-- == 0) ++ break; ++ ++ if(pfr->order == 0) { ++#if defined(RING_DEBUG) ++ printk("ERROR: not enough memory\n"); ++#endif ++ return(-1); ++ } else { ++#if defined(RING_DEBUG) ++ printk("RING: succesfully allocated %lu KB [tot_mem=%d][order=%ld]\n", ++ PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order); ++#endif ++ } ++ ++ tot_mem = PAGE_SIZE << pfr->order; ++ memset((char*)pfr->ring_memory, 0, tot_mem); ++ ++ /* Now we need to reserve the pages */ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ SetPageReserved(page); ++ ++ pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory; ++ pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo)); ++ ++ pfr->slots_info->version = RING_FLOWSLOT_VERSION; ++ pfr->slots_info->slot_len = the_slot_len; ++ pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len; ++ pfr->slots_info->tot_mem = tot_mem; ++ pfr->slots_info->sample_rate = sample_rate; ++ ++#if 
defined(RING_DEBUG) ++ printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n", ++ pfr->slots_info->tot_slots, pfr->slots_info->slot_len, ++ pfr->slots_info->tot_mem); ++#endif ++ ++#ifdef RING_MAGIC ++ { ++ int i; ++ ++ for(i=0; islots_info->tot_slots; i++) { ++ unsigned long idx = i*pfr->slots_info->slot_len; ++ FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx]; ++ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; ++ } ++ } ++#endif ++ ++ pfr->insert_page_id = 1, pfr->insert_slot_id = 0; ++ ++ /* ++ IMPORTANT ++ Leave this statement here as last one. In fact when ++ the ring_netdev != NULL the socket is ready to be used. ++ */ ++ pfr->ring_netdev = dev; ++ ++ return(0); ++} ++ ++/* ************************************* */ ++ ++/* Bind to a device */ ++static int ring_bind(struct socket *sock, ++ struct sockaddr *sa, int addr_len) ++{ ++ struct sock *sk=sock->sk; ++ struct net_device *dev = NULL; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_bind() called\n"); ++#endif ++ ++ /* ++ * Check legality ++ */ ++ if (addr_len != sizeof(struct sockaddr)) ++ return -EINVAL; ++ if (sa->sa_family != PF_RING) ++ return -EINVAL; ++ ++ /* Safety check: add trailing zero if missing */ ++ sa->sa_data[sizeof(sa->sa_data)-1] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("RING: searching device %s\n", sa->sa_data); ++#endif ++ ++ if((dev = __dev_get_by_name(sa->sa_data)) == NULL) { ++#if defined(RING_DEBUG) ++ printk("RING: search failed\n"); ++#endif ++ return(-EINVAL); ++ } else ++ return(packet_ring_bind(sk, dev)); ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ ++volatile void* virt_to_kseg(volatile void* address) { ++ pte_t *pte; ++ pud_t *pud; ++ unsigned long addr = (unsigned long)address; ++ ++ pud = pud_offset(pgd_offset_k((unsigned long) address), ++ (unsigned long) address); ++ ++ /* ++ High-memory support courtesy of ++ Brad Doctor ++ */ ++#if defined(CONFIG_X86_PAE) && 
(!defined(CONFIG_NOHIGHMEM)) ++ pte = pte_offset_map(pmd_offset(pud, addr), addr); ++#else ++ pte = pmd_offset_map(pud, addr); ++#endif ++ ++ return((volatile void*)pte_page(*pte)); ++} ++ ++#else /* 2.4 */ ++ ++/* http://www.scs.ch/~frey/linux/memorymap.html */ ++volatile void *virt_to_kseg(volatile void *address) ++{ ++ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; ++ unsigned long va, ret = 0UL; ++ ++ va=VMALLOC_VMADDR((unsigned long)address); ++ ++ /* get the page directory. Use the kernel memory map. */ ++ pgd = pgd_offset_k(va); ++ ++ /* check whether we found an entry */ ++ if (!pgd_none(*pgd)) ++ { ++ /* get the page middle directory */ ++ pmd = pmd_offset(pgd, va); ++ /* check whether we found an entry */ ++ if (!pmd_none(*pmd)) ++ { ++ /* get a pointer to the page table entry */ ++ ptep = pte_offset(pmd, va); ++ pte = *ptep; ++ /* check for a valid page */ ++ if (pte_present(pte)) ++ { ++ /* get the address the page is refering to */ ++ ret = (unsigned long)page_address(pte_page(pte)); ++ /* add the offset within the page to the page address */ ++ ret |= (va & (PAGE_SIZE -1)); ++ } ++ } ++ } ++ return((volatile void *)ret); ++} ++#endif ++ ++/* ************************************* */ ++ ++static int ring_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ unsigned long size, start; ++ u_int pagesToMap; ++ char *ptr; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called\n"); ++#endif ++ ++ if(pfr->ring_memory == 0) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: mapping area to an unbound socket\n"); ++#endif ++ return -EINVAL; ++ } ++ ++ size = (unsigned long)(vma->vm_end-vma->vm_start); ++ ++ if(size % PAGE_SIZE) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n"); ++#endif ++ return(-EINVAL); ++ } ++ ++ /* if userspace tries to mmap beyond end of our buffer, fail */ ++ if(size 
> pfr->slots_info->tot_mem) { ++#if defined(RING_DEBUG) ++ printk("proc_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem); ++#endif ++ return(-EINVAL); ++ } ++ ++ pagesToMap = size/PAGE_SIZE; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called. %d pages to map\n", pagesToMap); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n", ++ pfr->slots_info->slot_len, pfr->slots_info->tot_slots, ++ pfr->ring_netdev->name); ++#endif ++ ++ /* we do not want to have this area swapped out, lock it */ ++ vma->vm_flags |= VM_LOCKED; ++ start = vma->vm_start; ++ ++ /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */ ++ ptr = (char*)(start+PAGE_SIZE); ++ ++ if(remap_page_range( ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ vma, ++#endif ++ start, ++ __pa(pfr->ring_memory), ++ PAGE_SIZE*pagesToMap, vma->vm_page_prot)) { ++#if defined(RING_DEBUG) ++ printk("remap_page_range() failed\n"); ++#endif ++ return(-EAGAIN); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("proc_mmap(pagesToMap=%d): success.\n", pagesToMap); ++#endif ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++static int ring_recvmsg(struct kiocb *iocb, struct socket *sock, ++ struct msghdr *msg, size_t len, int flags) ++#else ++ static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len, ++ int flags, struct scm_cookie *scm) ++#endif ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ u_int32_t queued_pkts, num_loops = 0; ++ ++#if defined(RING_DEBUG) ++ printk("ring_recvmsg called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) { ++ wait_event_interruptible(pfr->ring_slots_waitqueue, 1); ++ ++#if defined(RING_DEBUG) ++ printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n", ++ slot->slot_state, 
queued_pkts, num_loops); ++#endif ++ ++ if(queued_pkts > 0) { ++ if(num_loops++ > MAX_QUEUE_LOOPS) ++ break; ++ } ++ } ++ ++#if defined(RING_DEBUG) ++ if(slot != NULL) ++ printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n", ++ queued_pkts, num_loops); ++#endif ++ ++ return(queued_pkts); ++} ++ ++/* ************************************* */ ++ ++unsigned int ring_poll(struct file * file, ++ struct socket *sock, poll_table *wait) ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ ++#if defined(RING_DEBUG) ++ printk("poll called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ if((slot != NULL) && (slot->slot_state == 0)) ++ poll_wait(file, &pfr->ring_slots_waitqueue, wait); ++ ++#if defined(RING_DEBUG) ++ printk("poll returning %d\n", slot->slot_state); ++#endif ++ ++ if((slot != NULL) && (slot->slot_state == 1)) ++ return(POLLIN | POLLRDNORM); ++ else ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int add_to_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ ++ if(el->num_cluster_elements == CLUSTER_LEN) ++ return(-1); /* Cluster full */ ++ ++ ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id; ++ el->sk[el->num_cluster_elements] = sock; ++ el->num_cluster_elements++; ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int remove_from_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ int i, j; ++ ++ for(i=0; isk[i] == sock) { ++ el->num_cluster_elements--; ++ ++ if(el->num_cluster_elements > 0) { ++ /* The cluster contains other elements */ ++ for(j=i; jsk[j] = el->sk[j+1]; ++ ++ el->sk[CLUSTER_LEN-1] = NULL; ++ } else { ++ /* Empty cluster */ ++ memset(el->sk, 0, sizeof(el->sk)); ++ } ++ ++ return(0); ++ } ++ ++ return(-1); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int remove_from_cluster(struct sock *sock, ++ struct ring_opt *pfr) ++{ ++ struct ring_cluster *el; ++ ++#if defined(RING_DEBUG) ++ 
printk("--> remove_from_cluster(%d)\n", pfr->cluster_id); ++#endif ++ ++ if(pfr->cluster_id == 0 /* 0 = No Cluster */) ++ return(0); /* Noting to do */ ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == pfr->cluster_id) { ++ return(remove_from_cluster_list(el, sock)); ++ } else ++ el = el->next; ++ } ++ ++ return(-EINVAL); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int add_to_cluster(struct sock *sock, ++ struct ring_opt *pfr, ++ u_short cluster_id) ++{ ++ struct ring_cluster *el; ++ ++#ifndef RING_DEBUG ++ printk("--> add_to_cluster(%d)\n", cluster_id); ++#endif ++ ++ if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL); ++ ++ if(pfr->cluster_id != 0) ++ remove_from_cluster(sock, pfr); ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == cluster_id) { ++ return(add_to_cluster_list(el, sock)); ++ } else ++ el = el->next; ++ } ++ ++ /* There's no existing cluster. We need to create one */ ++ if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL) ++ return(-ENOMEM); ++ ++ el->cluster_id = cluster_id; ++ el->num_cluster_elements = 1; ++ el->hashing_mode = cluster_per_flow; /* Default */ ++ el->hashing_id = 0; ++ ++ memset(el->sk, 0, sizeof(el->sk)); ++ el->sk[0] = sock; ++ el->next = ring_cluster_list; ++ ring_cluster_list = el; ++ pfr->cluster_id = cluster_id; ++ ++ return(0); /* 0 = OK */ ++} ++ ++/* ************************************* */ ++ ++/* Code taken/inspired from core/sock.c */ ++static int ring_setsockopt(struct socket *sock, ++ int level, int optname, ++ char *optval, int optlen) ++{ ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ int val, found, ret = 0; ++ u_int cluster_id; ++ char devName[8]; ++ ++ if((optleninsns, fprog.filter, fsize)) ++ break; ++ ++ filter->len = fprog.len; ++ ++ if(sk_chk_filter(filter->insns, filter->len) != 0) { ++ /* Bad filter specified */ ++ kfree(filter); ++ pfr->bpfFilter = NULL; ++ break; ++ } ++ ++ /* get the 
lock, set the filter, release the lock */ ++ write_lock(&ring_mgmt_lock); ++ pfr->bpfFilter = filter; ++ write_unlock(&ring_mgmt_lock); ++ } ++ ret = 0; ++ break; ++ ++ case SO_DETACH_FILTER: ++ write_lock(&ring_mgmt_lock); ++ found = 1; ++ if(pfr->bpfFilter != NULL) { ++ kfree(pfr->bpfFilter); ++ pfr->bpfFilter = NULL; ++ write_unlock(&ring_mgmt_lock); ++ break; ++ } ++ ret = -ENONET; ++ break; ++ ++ case SO_ADD_TO_CLUSTER: ++ if (optlen!=sizeof(val)) ++ return -EINVAL; ++ ++ if (copy_from_user(&cluster_id, optval, sizeof(cluster_id))) ++ return -EFAULT; ++ ++ write_lock(&ring_mgmt_lock); ++ ret = add_to_cluster(sock->sk, pfr, cluster_id); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_REMOVE_FROM_CLUSTER: ++ write_lock(&ring_mgmt_lock); ++ ret = remove_from_cluster(sock->sk, pfr); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_SET_REFLECTOR: ++ if(optlen >= (sizeof(devName)-1)) ++ return -EINVAL; ++ ++ if(optlen > 0) { ++ if(copy_from_user(devName, optval, optlen)) ++ return -EFAULT; ++ } ++ ++ devName[optlen] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("+++ SO_SET_REFLECTOR(%s)\n", devName); ++#endif ++ ++ write_lock(&ring_mgmt_lock); ++ pfr->reflector_dev = dev_get_by_name(devName); ++ write_unlock(&ring_mgmt_lock); ++ ++#if defined(RING_DEBUG) ++ if(pfr->reflector_dev != NULL) ++ printk("SO_SET_REFLECTOR(%s): succeded\n", devName); ++ else ++ printk("SO_SET_REFLECTOR(%s): device unknown\n", devName); ++#endif ++ break; ++ ++ default: ++ found = 0; ++ break; ++ } ++ ++ if(found) ++ return(ret); ++ else ++ return(sock_setsockopt(sock, level, optname, optval, optlen)); ++} ++ ++/* ************************************* */ ++ ++static int ring_ioctl(struct socket *sock, ++ unsigned int cmd, unsigned long arg) ++{ ++ switch(cmd) ++ { ++ case SIOCGIFFLAGS: ++ case SIOCSIFFLAGS: ++ case SIOCGIFCONF: ++ case SIOCGIFMETRIC: ++ case SIOCSIFMETRIC: ++ case SIOCGIFMEM: ++ case SIOCSIFMEM: ++ case SIOCGIFMTU: ++ case SIOCSIFMTU: ++ case SIOCSIFLINK: 
++ case SIOCGIFHWADDR: ++ case SIOCSIFHWADDR: ++ case SIOCSIFMAP: ++ case SIOCGIFMAP: ++ case SIOCSIFSLAVE: ++ case SIOCGIFSLAVE: ++ case SIOCGIFINDEX: ++ case SIOCGIFNAME: ++ case SIOCGIFCOUNT: ++ case SIOCSIFHWBROADCAST: ++ return(dev_ioctl(cmd,(void *) arg)); ++ ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++static struct proto_ops ring_ops = { ++ .family = PF_RING, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++ ++ /* Operations that make no sense on ring sockets. */ ++ .connect = sock_no_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = sock_no_getname, ++ .listen = sock_no_listen, ++ .shutdown = sock_no_shutdown, ++ .sendpage = sock_no_sendpage, ++ .sendmsg = sock_no_sendmsg, ++ .getsockopt = sock_no_getsockopt, ++ ++ /* Now the operations that really occur. */ ++ .release = ring_release, ++ .bind = ring_bind, ++ .mmap = ring_mmap, ++ .poll = ring_poll, ++ .setsockopt = ring_setsockopt, ++ .ioctl = ring_ioctl, ++ .recvmsg = ring_recvmsg, ++}; ++ ++/* ************************************ */ ++ ++static struct net_proto_family ring_family_ops = { ++ .family = PF_RING, ++ .create = ring_create, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++}; ++ ++// BD: API changed in 2.6.12, ref: ++// http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++static struct proto ring_proto = { ++ .name = "PF_RING", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct sock), ++}; ++#endif ++ ++/* ************************************ */ ++ ++static void __exit ring_exit(void) ++{ ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ kfree(entry); ++ } ++ ++ while(ring_cluster_list != NULL) { ++ struct ring_cluster 
*next = ring_cluster_list->next; ++ kfree(ring_cluster_list); ++ ring_cluster_list = next; ++ } ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ ++ printk("PF_RING shut down.\n"); ++} ++ ++/* ************************************ */ ++ ++static int __init ring_init(void) ++{ ++ printk("Welcome to PF_RING %s\n(C) 2004 L.Deri \n", ++ RING_VERSION); ++ ++ INIT_LIST_HEAD(&ring_table); ++ ring_cluster_list = NULL; ++ ++ sock_register(&ring_family_ops); ++ ++ set_skb_ring_handler(skb_ring_handler); ++ set_buffer_ring_handler(buffer_ring_handler); ++ ++ if(get_buffer_ring_handler() != buffer_ring_handler) { ++ printk("PF_RING: set_buffer_ring_handler FAILED\n"); ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ return -1; ++ } else { ++ printk("PF_RING: bucket length %d bytes\n", bucket_len); ++ printk("PF_RING: ring slots %d\n", num_slots); ++ printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate); ++ printk("PF_RING: capture TX %s\n", ++ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]"); ++ printk("PF_RING: transparent mode %s\n", ++ transparent_mode ? "Yes" : "No"); ++ ++ printk("PF_RING initialized correctly.\n"); ++ return 0; ++ } ++} ++ ++module_init(ring_init); ++module_exit(ring_exit); ++MODULE_LICENSE("GPL"); ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++MODULE_ALIAS_NETPROTO(PF_RING); ++#endif -- 2.11.0