/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_ctl.c,v 1.3 2000/07/06 14:57:09 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@iinchina.net>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <uli@linux.tu-varna.acad.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/timer.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip.h>
#include <net/sock.h>

#include <asm/uaccess.h>

#include "ip_vs.h"
//#include <net/ip_vs.h>

/*
 * spinlock for do_ip_vs_set_ctl
 * NOTE(review): despite the "mutex" name this is a spinlock; its user
 * (do_ip_vs_set_ctl) is not visible in this part of the file.
 */
spinlock_t __ip_vs_mutex = SPIN_LOCK_UNLOCKED;

/*
 * lock for service table: taken for reading by ip_vs_lookup_service()
 * and for writing (with bottom halves disabled) by the code that
 * hashes/unhashes services or changes a service's destination list.
 */
rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;

/* lock for table with the real services (ip_vs_rtable) */
rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;

/*
 * lock for state and timeout tables; held for writing by
 * update_defense_level() around ip_vs_secure_tcp_set().
 */
rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;

/* lock for drop entry handling (guards updates of ip_vs_dropentry) */
spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;

/* lock for drop packet handling (guards ip_vs_drop_rate/ip_vs_drop_counter) */
spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;

/*
 * Current defense state, recomputed by update_defense_level():
 *   ip_vs_drop_rate    - 0 when packet dropping is off, otherwise the
 *                        rate chosen by the drop_packet mode
 *   ip_vs_drop_counter - reset together with ip_vs_drop_rate in the
 *                        automatic modes (its consumer is not visible
 *                        in this part of the file)
 *   ip_vs_dropentry    - 0/1 flag set according to the drop_entry mode
 */
int ip_vs_drop_rate = 0;
int ip_vs_drop_counter = 0;
atomic_t ip_vs_dropentry = ATOMIC_INIT(0);

/*
 * sysctl variables
 *
 * drop_entry, drop_packet and secure_tcp share the same mode encoding,
 * as interpreted by update_defense_level():
 *   0 - strategy disabled
 *   1 - automatic, currently inactive (switches to 2 when memory is low)
 *   2 - automatic, currently active   (switches back to 1 otherwise)
 *   3 - strategy always enabled
 * amemthresh is the threshold (in pages) that free + page cache +
 * buffer pages are compared against; am_droprate is the drop rate
 * used when drop_packet is in mode 3.
 */
int sysctl_ip_vs_debug_level = 0;
int sysctl_ip_vs_drop_entry = 0;
int sysctl_ip_vs_drop_packet = 0;
int sysctl_ip_vs_secure_tcp = 0;
int sysctl_ip_vs_amemthresh = 1024;
int sysctl_ip_vs_am_droprate = 10;

/*
 *  Accessor for the current value of the "debug_level" sysctl.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}


/*
 *  update_defense_level is called from timer bh and from sysctl.
 *
 *  Re-evaluates the three defense strategies (drop_entry, drop_packet,
 *  secure_tcp) against the amount of available memory.  Each strategy
 *  is driven by its sysctl mode:
 *	0 - off
 *	1 - automatic, idle:   turned on (mode becomes 2) when memory is low
 *	2 - automatic, active: turned off (mode becomes 1) when memory is back
 *	3 - always on
 *  "Low memory" means free pages + page cache pages + buffer pages is
 *  below the amemthresh sysctl.
 */
void update_defense_level(void)
{
	/*
	 * Read the threshold exactly once.  The amemthresh sysctl is
	 * written through plain proc_dointvec without taking any of the
	 * locks used below, so re-reading it after "nomem" has been
	 * computed could yield a zero or negative divisor in the
	 * drop rate computation.  With the snapshot, nomem != 0
	 * guarantees (amemthresh - ip_vs_amem) >= 1.
	 */
	int amemthresh = sysctl_ip_vs_amemthresh;
	int ip_vs_amem = nr_free_pages() + atomic_read(&page_cache_size) +
		atomic_read(&buffermem_pages);
	int nomem = (ip_vs_amem < amemthresh);

	/* drop_entry */
	spin_lock(&__ip_vs_dropentry_lock);
	switch (sysctl_ip_vs_drop_entry) {
	case 0:
		atomic_set(&ip_vs_dropentry,0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ip_vs_dropentry,1);
			sysctl_ip_vs_drop_entry = 2;
		} else {
			atomic_set(&ip_vs_dropentry,0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ip_vs_dropentry,1);
		} else {
			atomic_set(&ip_vs_dropentry,0);
			sysctl_ip_vs_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ip_vs_dropentry,1);
		break;
	}
	spin_unlock(&__ip_vs_dropentry_lock);

	/* drop_packet */
	spin_lock(&__ip_vs_droppacket_lock);
	switch (sysctl_ip_vs_drop_packet) {
	case 0:
		ip_vs_drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			/* the lower the memory, the smaller the rate value */
			ip_vs_drop_rate = ip_vs_drop_counter
				= amemthresh / (amemthresh-ip_vs_amem);
			sysctl_ip_vs_drop_packet = 2;
		} else {
			ip_vs_drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= amemthresh / (amemthresh-ip_vs_amem);
		} else {
			ip_vs_drop_rate = 0;
			sysctl_ip_vs_drop_packet = 1;
		}
		break;
	case 3:
		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
		break;
	}
	spin_unlock(&__ip_vs_droppacket_lock);

	/* secure_tcp */
	write_lock(&__ip_vs_securetcp_lock);
	switch (sysctl_ip_vs_secure_tcp) {
	case 0:
		ip_vs_secure_tcp_set(0);
		break;
	case 1:
		if (nomem) {
			ip_vs_secure_tcp_set(1);
			sysctl_ip_vs_secure_tcp = 2;
		} else {
			ip_vs_secure_tcp_set(0);
		}
		break;
	case 2:
		if (nomem) {
			ip_vs_secure_tcp_set(1);
		} else {
			ip_vs_secure_tcp_set(0);
			sysctl_ip_vs_secure_tcp = 1;
		}
		break;
	case 3:
		ip_vs_secure_tcp_set(1);
		break;
	}
	write_unlock(&__ip_vs_securetcp_lock);
}


/*
 *  Hash table: for virtual service lookups
 *  (2^IP_VS_SVC_TAB_BITS = 256 buckets; IP_VS_SVC_TAB_MASK reduces a
 *  hash value to a bucket index)
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/*
 * the service table hashed by <protocol, addr, port>
 * (services are chained through their s_list member)
 */
struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/*
 * the service table hashed by fwmark
 * (services are chained through their f_list member)
 */
struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];

/*
 *  Hash table: for real service lookups
 *  (2^IP_VS_RTAB_BITS = 16 buckets; destinations are chained through
 *  their d_list member)
 */
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)

struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];

/*
 * Trash for destinations: destinations that were removed from their
 * service while still referenced are parked here (linked through
 * n_list) so that they can be picked up again by ip_vs_get_trash_dest().
 */
LIST_HEAD(ip_vs_dest_trash);

/*
 * FTP & NULL virtual service counters: number of services whose port
 * is FTPPORT, respectively 0.  ip_vs_lookup_service() only tries its
 * FTP / port-zero fallback lookups while the matching counter is
 * non-zero.
 */
atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);


/*
 *  Compute the ip_vs_svc_table bucket index for <proto,addr,port>.
 *  addr and port are converted with ntohl()/ntohs() before mixing;
 *  the port is folded in twice (as is, and shifted right by the
 *  table bit count).
 */
static __inline__ unsigned
ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	unsigned hport = ntohs(port);
	unsigned key;

	key = proto ^ ntohl(addr);
	key ^= hport >> IP_VS_SVC_TAB_BITS;
	key ^= hport;

	return key & IP_VS_SVC_TAB_MASK;
}

/*
 *  Compute the ip_vs_svc_fwm_table bucket index for a fwmark:
 *  simply the low IP_VS_SVC_TAB_BITS bits of the mark.
 */
static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
{
	unsigned bucket = fwmark & IP_VS_SVC_TAB_MASK;

	return bucket;
}

/*
 *  Insert a service into the lookup tables.
 *  A service without fwmark goes into ip_vs_svc_table (keyed by
 *  <proto,addr,port>, linked via s_list); a service with a fwmark goes
 *  into ip_vs_svc_fwm_table (keyed by fwmark, linked via f_list).
 *  Should be called with locked tables.
 *  Returns 1 on success, 0 if the service is already flagged as hashed.
 */
int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	struct list_head *node;
	struct list_head *bucket;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark != 0) {
		/* keyed by fwmark */
		node = &svc->f_list;
		bucket = &ip_vs_svc_fwm_table[ip_vs_svc_fwm_hashkey(svc->fwmark)];
	} else {
		/* keyed by <protocol,addr,port> */
		node = &svc->s_list;
		bucket = &ip_vs_svc_table[ip_vs_svc_hashkey(svc->protocol,
							    svc->addr,
							    svc->port)];
	}
	list_add(node, bucket);

	svc->flags |= IP_VS_SVC_F_HASHED;
	return 1;
}


/*
 *  Remove a service from whichever lookup table it was hashed into
 *  (ip_vs_svc_table when it has no fwmark, ip_vs_svc_fwm_table
 *  otherwise) and clear its hashed flag.
 *  Should be called with locked tables.
 *  Returns 1 on success, 0 if the service is not flagged as hashed.
 */
int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/* the fwmark decides which list member was used for hashing */
	list_del(svc->fwmark ? &svc->f_list : &svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	return 1;
}


/*
 *  Find the service that exactly matches {proto,addr,port} in
 *  ip_vs_svc_table.  On a hit the service's refcnt is incremented
 *  before it is returned; returns NULL when nothing matches.
 *  Does no locking itself.
 */
static __inline__ struct ip_vs_service *
__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport)
{
	struct list_head *head, *pos;
	struct ip_vs_service *svc;

	head = &ip_vs_svc_table[ip_vs_svc_hashkey(protocol, vaddr, vport)];

	pos = head->next;
	while (pos != head) {
		svc = list_entry(pos, struct ip_vs_service, s_list);
		pos = pos->next;

		if (svc->addr != vaddr
		    || svc->port != vport
		    || svc->protocol != protocol)
			continue;

		/* HIT */
		atomic_inc(&svc->refcnt);
		return svc;
	}

	return NULL;
}


/*
 *  Find the service with the given fwmark in ip_vs_svc_fwm_table.
 *  On a hit the service's refcnt is incremented before it is
 *  returned; returns NULL when nothing matches.
 *  Does no locking itself.
 */
static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark)
{
	struct list_head *head, *pos;
	struct ip_vs_service *svc;

	head = &ip_vs_svc_fwm_table[ip_vs_svc_fwm_hashkey(fwmark)];

	pos = head->next;
	while (pos != head) {
		svc = list_entry(pos, struct ip_vs_service, f_list);
		pos = pos->next;

		if (svc->fwmark != fwmark)
			continue;

		/* HIT */
		atomic_inc(&svc->refcnt);
		return svc;
	}

	return NULL;
}

/*
 *  Look up a virtual service under the service table read lock.
 *
 *  With a non-zero fwmark only the fwmark table is consulted.
 *  Otherwise the <protocol,addr,port> table is tried in this order:
 *    1. the exact <protocol,vaddr,vport> entry;
 *    2. for TCP, while FTP services exist and vport is FTPDATA or not
 *       below PROT_SOCK: the <protocol,vaddr,FTPPORT> entry;
 *    3. while port-zero services exist: the <protocol,vaddr,0> entry.
 *
 *  Returns the service (its refcnt incremented by the helper that
 *  found it) or NULL.
 */
struct ip_vs_service *
ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
{
	struct ip_vs_service *svc;

	read_lock(&__ip_vs_svc_lock);

	if (fwmark) {
		/* only the table hashed by fwmark is relevant */
		svc = __ip_vs_lookup_svc_fwm(fwmark);
		goto out;
	}

	/* "full" addressed entry first */
	svc = __ip_vs_lookup_service(protocol, vaddr, vport);
	if (svc != NULL)
		goto out;

	if (protocol == IPPROTO_TCP
	    && atomic_read(&ip_vs_ftpsvc_counter)
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/* the packet might belong to an FTP data connection */
		svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT);
		if (svc != NULL)
			goto out;
	}

	if (atomic_read(&ip_vs_nullsvc_counter)) {
		/* fall back to the catch-all port (port zero) */
		svc = __ip_vs_lookup_service(protocol, vaddr, 0);
	}

  out:
	read_unlock(&__ip_vs_svc_lock);

	IP_VS_DBG(2, "lookup_service fwm %d %s %u.%u.%u.%u:%u %s\n",
		  fwmark, vs_proto_name(protocol),
		  NIPQUAD(vaddr), ntohs(vport),
		  svc?"hit":"not hit");

	return svc;
}


/*
 *  Compute the ip_vs_rtable bucket index for a real service.
 *  addr and port are converted with ntohl()/ntohs() before mixing.
 */
static __inline__ unsigned
ip_vs_rs_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	unsigned hport = ntohs(port);
	unsigned key;

	key = proto ^ ntohl(addr);
	key ^= hport >> IP_VS_RTAB_BITS;
	key ^= hport;

	return key & IP_VS_RTAB_MASK;
}

/*
 *  Insert a destination into ip_vs_rtable.
 *  The bucket is derived from the destination's addr and port only
 *  (the protocol argument of the hash is passed as 0).
 *  should be called with locked tables.
 *  returns 1 when inserted, 0 when d_list is already linked somewhere.
 */
int ip_vs_rs_hash(struct ip_vs_dest *dest)
{
	struct list_head *bucket;

	if (!list_empty(&dest->d_list))
		return 0;

	bucket = &ip_vs_rtable[ip_vs_rs_hashkey(0, dest->addr, dest->port)];
	list_add(&dest->d_list, bucket);

	return 1;
}

/*
 *  Take a destination out of ip_vs_rtable, if it is in there, and
 *  re-initialise its d_list so that it reads as "not hashed" again.
 *  should be called with locked tables.
 *  always returns 1.
 */
int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	if (list_empty(&dest->d_list))
		return 1;	/* not hashed, nothing to do */

	list_del(&dest->d_list);
	INIT_LIST_HEAD(&dest->d_list);

	return 1;
}

/*
 *  Find a real service by {proto,addr,port} in ip_vs_rtable.
 *  An entry matches when addr and port are equal and either its
 *  protocol equals the requested one or it belongs to a fwmark
 *  service (vfwmark != 0).  The first match is returned, NULL if
 *  there is none.
 *  The table is walked under the __ip_vs_rs_lock read lock; no
 *  reference is taken on the returned destination.
 */
struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *found = NULL;
	struct list_head *head, *pos;

	head = &ip_vs_rtable[ip_vs_rs_hashkey(0, daddr, dport)];

	read_lock(&__ip_vs_rs_lock);
	for (pos = head->next; pos != head; pos = pos->next) {
		struct ip_vs_dest *dest =
			list_entry(pos, struct ip_vs_dest, d_list);

		if (dest->addr != daddr || dest->port != dport)
			continue;
		if (dest->protocol == protocol || dest->vfwmark) {
			/* HIT */
			found = dest;
			break;
		}
	}
	read_unlock(&__ip_vs_rs_lock);

	return found;
}

/*
 *  Find the destination with the given {addr,port} in the
 *  destination list of svc.  Returns NULL when the service has no
 *  such destination.  Does no locking itself.
 */
struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct list_head *head = &svc->destinations;
	struct list_head *pos = head->next;

	while (pos != head) {
		struct ip_vs_dest *dest =
			list_entry(pos, struct ip_vs_dest, n_list);

		if (dest->addr == daddr && dest->port == dport)
			return dest;	/* HIT */

		pos = pos->next;
	}

	return NULL;
}


/*
 *  Search the destination trash for a dest matching {svc,addr,port}.
 *
 *  The trash keeps destinations that were removed from their service
 *  but are still referenced by connection entries.  When a dest is
 *  only temporarily down (taken out by the administrator or by a
 *  monitor program) it can be picked back from the trash, so that the
 *  remaining connections continue and its counters stay available to
 *  the scheduler.
 *
 *  A dest matches when addr, port, fwmark and protocol agree with the
 *  request and, for a service without fwmark, the virtual addr/port
 *  agree as well.  The matching dest is returned still linked in the
 *  trash; unlinking it is left to the caller.
 *
 *  As a side effect every non-matching entry visited on the way whose
 *  refcnt has dropped to 1 is unlinked and freed.
 */
struct ip_vs_dest *
ip_vs_get_trash_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct list_head *pos, *next;
	struct ip_vs_dest *dest;

	/* "next" is saved up front because the current entry may be freed */
	for (pos = ip_vs_dest_trash.next; pos != &ip_vs_dest_trash; pos = next) {
		next = pos->next;
		dest = list_entry(pos, struct ip_vs_dest, n_list);

		IP_VS_DBG(1, "Destination %u/%u.%u.%u.%u:%u still in trash, refcnt=%d\n",
			  dest->vfwmark,
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));

		if (dest->addr == daddr &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (dest->vaddr == svc->addr &&
		      dest->vport == svc->port)))
			return dest;	/* HIT */

		/* still referenced by somebody else: leave it in the trash */
		if (atomic_read(&dest->refcnt) != 1)
			continue;

		IP_VS_DBG(1, "Removing destination %u/%u.%u.%u.%u:%u "
			  "from trash\n",
			  dest->vfwmark,
			  NIPQUAD(dest->addr), ntohs(dest->port));
		list_del(&dest->n_list);
		kfree_s(dest, sizeof(*dest));
	}

	return NULL;
}


/*
 *  Apply the user supplied settings (weight, connection flags) to a
 *  destination and mark it available.
 *
 *  - IP_VS_CONN_F_INACTIVE is always added to the connection flags.
 *  - If ur->daddr is a local address the forwarding method is
 *    replaced by IP_VS_CONN_F_LOCALNODE.
 *  - A non-zero forwarding method (i.e. not masquerading) gets
 *    IP_VS_CONN_F_NOOUTPUT; a masquerading dest is instead put into
 *    ip_vs_rtable if it is not there yet.
 *
 *  The svc argument is not used here.
 */
void __ip_vs_update_dest(struct ip_vs_service *svc,
			 struct ip_vs_dest *dest,
			 struct ip_vs_rule_user *ur)
{
	int conn_flags = ur->conn_flags | IP_VS_CONN_F_INACTIVE;
	int masq;

	atomic_set(&dest->weight, ur->weight);

	/* a destination on this host is always handled as local node */
	if (inet_addr_type(ur->daddr) == RTN_LOCAL) {
		conn_flags &= ~IP_VS_CONN_F_FWD_MASK;
		conn_flags |= IP_VS_CONN_F_LOCALNODE;
	}

	/* forwarding method 0 is masquerading (NAT) */
	masq = ((conn_flags & IP_VS_CONN_F_FWD_MASK) == 0);
	if (!masq)
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;

	atomic_set(&dest->conn_flags,conn_flags);

	if (masq) {
		/*
		 *    Put the real service in ip_vs_rtable if not present.
		 *    For now only for NAT!
		 */
		write_lock_bh(&__ip_vs_rs_lock);
		ip_vs_rs_hash(dest);
		write_unlock_bh(&__ip_vs_rs_lock);
	}

	dest->flags |= IP_VS_DEST_F_AVAILABLE;
}

        
/*
 *  Allocate and initialise a destination for the given service.
 *  The new dest copies protocol, virtual addr/port and fwmark from
 *  svc and the real addr/port from ur, starts with all counters and
 *  refcnt at 0, and gets its weight/flags from __ip_vs_update_dest().
 *  It is NOT linked into the service's destination list here.
 *  Returns NULL when the allocation fails.
 */
struct ip_vs_dest *
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_rule_user *ur)
{
	struct ip_vs_dest *dest;

	EnterFunction(1);

	dest = (struct ip_vs_dest *) kmalloc(sizeof(*dest), GFP_ATOMIC);
	if (!dest) {
		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
		return NULL;
	}
	memset(dest, 0, sizeof(*dest));

	/* the real server itself */
	dest->addr = ur->daddr;
	dest->port = ur->dport;

	/* the virtual service it belongs to */
	dest->protocol = svc->protocol;
	dest->vaddr = svc->addr;
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;

	atomic_set(&dest->refcnt, 0);
	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);

	/* must be an empty list before ip_vs_rs_hash() looks at it */
	INIT_LIST_HEAD(&dest->d_list);
	__ip_vs_update_dest(svc, dest, ur);

	LeaveFunction(1);

	return dest;
}


/*
 *  Add a destination to an existing service.
 *
 *  Returns -ERANGE for a negative weight, -EEXIST when the service
 *  already has a destination with that addr/port, -ENOMEM when a new
 *  destination cannot be allocated, and 0 on success.
 *
 *  A matching destination found in the trash is updated and reused;
 *  otherwise a fresh one is created and its refcnt raised by one.
 *  Either way the destination is linked into svc->destinations under
 *  the service table write lock, after busy-waiting until svc->refcnt
 *  is no longer above 1.
 */
int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_rule_user *ur)
{
	__u32 daddr = ur->daddr;
	__u16 dport = ur->dport;
	struct ip_vs_dest *dest;

	EnterFunction(1);

	if (ur->weight < 0) {
		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	/* refuse duplicates within the service */
	if (ip_vs_lookup_dest(svc, daddr, dport) != NULL) {
		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
		return -EEXIST;
	}

	/* maybe the same dest of the same service is waiting in the trash */
	dest = ip_vs_get_trash_dest(svc, daddr, dport);
	if (dest == NULL) {
		/* nothing to recycle: build a new destination */
		dest = ip_vs_new_dest(svc, ur);
		if (dest == NULL) {
			IP_VS_ERR("ip_vs_add_dest(): out of memory\n");
			return -ENOMEM;
		}

		/* reference held by the service's destination list */
		atomic_inc(&dest->refcnt);

		write_lock_bh(&__ip_vs_svc_lock);

		/* Wait until all svc users go away */
		while (atomic_read(&svc->refcnt) > 1)
			;

		list_add(&dest->n_list, &svc->destinations);

		write_unlock_bh(&__ip_vs_svc_lock);

		LeaveFunction(1);

		return 0;
	}

	IP_VS_DBG(1, "Get destination %u.%u.%u.%u:%u from trash, "
		  "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
		  NIPQUAD(daddr), ntohs(dport),
		  atomic_read(&dest->refcnt),
		  dest->vfwmark,
		  NIPQUAD(dest->vaddr),
		  ntohs(dest->vport));

	__ip_vs_update_dest(svc, dest, ur);

	/* take it out of the trash ... */
	list_del(&dest->n_list);

	/* ... and hand it back to the service */
	write_lock_bh(&__ip_vs_svc_lock);

	/* Wait until all svc users go away */
	while (atomic_read(&svc->refcnt) > 1)
		;

	list_add(&dest->n_list, &svc->destinations);

	write_unlock_bh(&__ip_vs_svc_lock);
	return 0;
}


/*
 *  Edit a destination in the given service
 *
 *  Looks the destination up by ur->daddr/ur->dport in the service's
 *  destination list and applies the new weight and connection flags
 *  through __ip_vs_update_dest().
 *
 *  Returns 0 on success, -ERANGE when ur->weight is negative and
 *  -ENOENT when the service has no such destination.
 */
int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_rule_user *ur)
{
	struct ip_vs_dest *dest;
	__u32 daddr = ur->daddr;
	__u16 dport = ur->dport;

	EnterFunction(1);

	if (ur->weight < 0) {
		/* report the function that actually rejected the request */
		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	/*
	 *  Lookup the destination list
	 */
	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest == NULL) {
		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
		return -ENOENT;
	}

	__ip_vs_update_dest(svc, dest, ur);

	LeaveFunction(1);

	return 0;
}


/*
 *  Dispose of a destination (must be already unlinked from the service)
 *
 *  The dest is first removed from ip_vs_rtable.  Then one reference
 *  is dropped: if that was the last one the dest is freed, otherwise
 *  it is parked in the destination trash, with the reference taken
 *  again on behalf of the trash list.
 */
void __ip_vs_del_dest(struct ip_vs_dest *dest)
{
	write_lock_bh(&__ip_vs_rs_lock);
	ip_vs_rs_unhash(dest);
	write_unlock_bh(&__ip_vs_rs_lock);

	if (!atomic_dec_and_test(&dest->refcnt)) {
		/* somebody still refers to it: keep it around in the trash */
		IP_VS_DBG(1, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		list_add(&dest->n_list, &ip_vs_dest_trash);
		atomic_inc(&dest->refcnt);
		return;
	}

	/* nobody refers to it any more */
	kfree_s(dest, sizeof(*dest));
}


/*
 *  Detach a destination from its service: clear the AVAILABLE flag
 *  and remove the dest from the list it is linked in through n_list.
 *  With svcupd != 0 the service's scheduler is told about the change
 *  through its update_service() hook.
 */
void __ip_vs_unlink_dest(struct ip_vs_service *svc,struct ip_vs_dest *dest,
			int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	list_del(&dest->n_list);

	if (!svcupd)
		return;

	/* let the scheduler adapt to the changed destination list */
	svc->scheduler->update_service(svc);
}


/*
 *  Delete the destination ur->daddr:ur->dport from a service.
 *  The dest is unlinked (with scheduler update) under the service
 *  table write lock, after busy-waiting until svc->refcnt is no
 *  longer above 1, and then handed to __ip_vs_del_dest().
 *  Returns 0 on success, -ENOENT when the service has no such dest.
 */
int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_rule_user *ur)
{
	struct ip_vs_dest *dest;

	EnterFunction(1);

	dest = ip_vs_lookup_dest(svc, ur->daddr, ur->dport);
	if (!dest) {
		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
		return -ENOENT;
	}

	write_lock_bh(&__ip_vs_svc_lock);

	/* Wait until all svc users go away */
	while (atomic_read(&svc->refcnt) > 1)
		;

	/* take it off the service and notify the scheduler */
	__ip_vs_unlink_dest(svc,dest,1);

	write_unlock_bh(&__ip_vs_svc_lock);

	/* free it, or move it to the trash if still referenced */
	__ip_vs_del_dest(dest);

	LeaveFunction(1);

	return 0;
}


/*
 *  Add a service into the service hash table
 *
 *  Builds a new virtual service from the user rule, binds the
 *  scheduler named by ur->sched_name to it and hashes it into the
 *  service table.  The module use count is raised for as long as the
 *  service exists (it is dropped again in __ip_vs_del_service).
 *
 *  Returns 0 on success, -ENOENT when the scheduler is not found,
 *  -ENOMEM when the service cannot be allocated, or the error
 *  returned by ip_vs_bind_scheduler().
 */
int ip_vs_add_service(struct ip_vs_rule_user *ur)
{
	__u16  protocol = ur->protocol;
	__u32 vaddr = ur->vaddr;
	__u16 vport = ur->vport;
	__u32 vfwmark = ur->vfwmark;

	int ret = 0;
	struct ip_vs_scheduler *sched;
	struct ip_vs_service *svc = NULL;

	MOD_INC_USE_COUNT;

	/*
	 * Lookup the scheduler, by 'ur->sched_name'
	 */
	sched = ip_vs_lookup_scheduler(ur->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
			   ur->sched_name);
		MOD_DEC_USE_COUNT;
		return -ENOENT;
	}

	svc = (struct ip_vs_service*)
		kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
	if (svc == NULL) {
		/*
		 * Do not touch the use count here: the out_err path
		 * below already drops it, and doing it twice would
		 * leave the module use count one too low.
		 */
		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
		ret =  -ENOMEM;
		goto out_err;
	}
	memset(svc, 0, sizeof(struct ip_vs_service));

	svc->protocol = protocol;
	svc->addr = vaddr;
	svc->port = vport;
	svc->fwmark = vfwmark;
	svc->flags = ur->vs_flags;
	svc->timeout = ur->timeout;
	svc->netmask = ur->netmask;

	INIT_LIST_HEAD(&svc->destinations);
	svc->sched_lock = RW_LOCK_UNLOCKED;

	/*
	 *    Bind the scheduler
	 */
	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret) {
		goto out_err;
	}

	/*
	 *    First svc user: the table with the virtual services
	 */
	atomic_set(&svc->refcnt,1);

	/*
	 *    Hash the service into the service table
	 */
	write_lock_bh(&__ip_vs_svc_lock);
	ip_vs_svc_hash(svc);
	write_unlock_bh(&__ip_vs_svc_lock);

	/*
	 *    Update the virtual service counters
	 */
	if (vport == FTPPORT)
		atomic_inc(&ip_vs_ftpsvc_counter);
	else if (vport == 0)
		atomic_inc(&ip_vs_nullsvc_counter);

	return 0;

  out_err:
	/*
	 * Common failure exit: release the scheduler module reference
	 * (presumably taken by ip_vs_lookup_scheduler - confirm there),
	 * undo our own MOD_INC_USE_COUNT exactly once, and free the
	 * service if it was allocated.
	 */
	__MOD_DEC_USE_COUNT(sched->module);
	MOD_DEC_USE_COUNT;
	if (svc) {
		kfree_s(svc, sizeof(struct ip_vs_service));
	}
	return ret;
}


/*
 *	Edit a service and bind it with a new scheduler
 *
 *	Updates flags, timeout and netmask of the service and replaces
 *	its scheduler by the one named in ur->sched_name.  The swap is
 *	done under the service table write lock, after busy-waiting
 *	until svc->refcnt is no longer above 1.
 *
 *	Returns 0 on success, -ENOENT when the scheduler is not found,
 *	or the error returned by ip_vs_bind_scheduler() when the new
 *	scheduler cannot be bound.  In the latter case the old scheduler
 *	is bound again; flags, timeout and netmask have already been
 *	updated at that point and are not rolled back.
 */
int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_rule_user *ur)
{
	struct ip_vs_scheduler *sched, *old_sched;
	int ret;

	/*
	 * Lookup the scheduler, by 'ur->sched_name'
	 */
	sched = ip_vs_lookup_scheduler(ur->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
			   ur->sched_name);
		return -ENOENT;
	}

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all svc users go away
	 */
	while (atomic_read(&svc->refcnt) > 1) {};

	/*
	 * Set the flags and timeout value
	 * (the service stays hashed, so keep IP_VS_SVC_F_HASHED set)
	 */
	svc->flags = ur->vs_flags | IP_VS_SVC_F_HASHED;
	svc->timeout = ur->timeout;
	svc->netmask = ur->netmask;

	/*
	 * Unbind the old scheduler
	 */
	old_sched = svc->scheduler;
	ip_vs_unbind_scheduler(svc);

	/*
	 * Bind the new scheduler
	 */
	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret) {
		/*
		 * Binding failed: fall back to the scheduler the
		 * service had before instead of leaving the service
		 * without one.  This is best effort - if the old
		 * scheduler cannot be bound again either, the service
		 * is left unbound and the error is still reported.
		 */
		if (old_sched)
			ip_vs_bind_scheduler(svc, old_sched);

		/*
		 * The old scheduler stays in use, so the module
		 * reference to release below is that of the new one.
		 */
		old_sched = sched;
	}

	write_unlock_bh(&__ip_vs_svc_lock);

	/* drop the module reference of the scheduler no longer in use */
	if (old_sched) __MOD_DEC_USE_COUNT(old_sched->module);

	return ret;
}


/*
 *  Tear down a service.
 *  The service must be unlinked, unlocked and not referenced!
 *
 *  Unbinds the scheduler and drops its module reference, unlinks and
 *  disposes of every destination, adjusts the FTP / port-zero service
 *  counters, frees the service and drops the module use count.
 *  Always returns 0.
 */
int __ip_vs_del_service(struct ip_vs_service *svc)
{
	struct ip_vs_scheduler *sched = svc->scheduler;
	struct list_head *head = &svc->destinations;
	struct ip_vs_dest *dest;

	ip_vs_unbind_scheduler(svc);
	if (sched)
		__MOD_DEC_USE_COUNT(sched->module);

	/* empty the destination list, one head entry at a time */
	while (!list_empty(head)) {
		dest = list_entry(head->next, struct ip_vs_dest, n_list);
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(dest);
	}

	/* keep the virtual service counters in step */
	if (svc->port == FTPPORT)
		atomic_dec(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_dec(&ip_vs_nullsvc_counter);

	kfree_s(svc, sizeof(struct ip_vs_service));
	MOD_DEC_USE_COUNT;
	return 0;
}

/*
 *  Remove a service: unhash it under the service table write lock,
 *  busy-wait until svc->refcnt is no longer above 1, release the
 *  lock and tear the service down.
 *  Returns -EEXIST for a NULL service, otherwise the result of
 *  __ip_vs_del_service().
 */
int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (!svc)
		return -EEXIST;

	write_lock_bh(&__ip_vs_svc_lock);

	/* no new lookups can find it from here on */
	ip_vs_svc_unhash(svc);

	/* Wait until all svc users go away */
	while (atomic_read(&svc->refcnt) > 1)
		;

	write_unlock_bh(&__ip_vs_svc_lock);

	return __ip_vs_del_service(svc);
}


/*
 *  Remove every virtual service from both service tables.
 *
 *  Each bucket is drained from its head: the first service is
 *  unhashed, all its users are waited for (busy-wait until refcnt is
 *  no longer above 1) and it is torn down, all under the service
 *  table write lock.  Always returns 0.
 */
int ip_vs_flush(void)
{
	struct ip_vs_service *svc;
	struct list_head *head;
	int idx;

	/* services hashed by <protocol,addr,port> */
	idx = 0;
	while (idx < IP_VS_SVC_TAB_SIZE) {
		head = &ip_vs_svc_table[idx++];
		while (!list_empty(head)) {
			svc = list_entry(head->next,
					 struct ip_vs_service, s_list);
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/* Wait until all svc users go away */
			while (atomic_read(&svc->refcnt) > 1)
				;
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	/* services hashed by fwmark */
	idx = 0;
	while (idx < IP_VS_SVC_TAB_SIZE) {
		head = &ip_vs_svc_fwm_table[idx++];
		while (!list_empty(head)) {
			svc = list_entry(head->next,
					 struct ip_vs_service, f_list);
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/* Wait until all svc users go away */
			while (atomic_read(&svc->refcnt) > 1)
				;
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	return 0;
}


/*
 *  sysctl handler for the defense mode variables (drop_entry,
 *  drop_packet, secure_tcp).
 *
 *  The integer is read/written through proc_dointvec().  When a write
 *  changed the value, a value outside 0..3 is reverted to what it was
 *  before; a valid new value triggers update_defense_level() with
 *  bottom halves disabled.  Returns the result of proc_dointvec().
 */
static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write,
	struct file * filp, void *buffer, size_t *lenp)
{
	int *valp = ctl->data;
	int old = *valp;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp);

	/* reads and no-op writes need no further action */
	if (!write || *valp == old)
		return ret;

	if (*valp < 0 || *valp > 3) {
		/* Restore the correct value */
		*valp = old;
		return ret;
	}

	local_bh_disable();
	update_defense_level();
	local_bh_enable();

	return ret;
}

/*
 *      IPVS sysctl table
 *
 *      Container for the whole sysctl hierarchy of IPVS.  Each
 *      directory level is a two element array: the entry itself plus
 *      the terminating empty entry.
 */
struct ip_vs_sysctl_table {
	/* NOTE(review): presumably the registration handle - it is not set in the visible code */
	struct ctl_table_header *sysctl_header;
	/* the 18 IPVS variables plus the terminating entry */
	ctl_table vs_vars[19];
	/* the "vs" directory, parent of vs_vars */
	ctl_table vs_dir[2];
	/* the "ipv4" directory, parent of vs_dir */
	ctl_table ipv4_dir[2];
	/* the "net" directory, parent of ipv4_dir */
	ctl_table root_dir[2];
};

/* sysctl id used for the "vs" directory entry below */
#define NET_IPV4_VS              21

/*
 *      The sysctl tree net -> ipv4 -> vs -> <variables>.
 *
 *      Each entry is initialised positionally as
 *      {id, name, data pointer, size, mode, child table, handler}.
 *      The three defense mode variables go through
 *      ip_vs_sysctl_defense_mode so that a change is validated and
 *      applied at once; the timeout_* entries expose the slots of
 *      vs_timeout_table_dos and use proc_dointvec_jiffies.
 */
static struct ip_vs_sysctl_table ipv4_vs_table = {
        NULL,
	/* vs_vars: the variables themselves */
	{{NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
          &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
          &proc_dointvec},
         {NET_IPV4_VS_AMEMTHRESH, "amemthresh",
          &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
          &proc_dointvec},
         {NET_IPV4_VS_AMDROPRATE, "am_droprate",
          &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
          &proc_dointvec},
         {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
          &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
          &ip_vs_sysctl_defense_mode},
         {NET_IPV4_VS_DROP_PACKET, "drop_packet",
          &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
          &ip_vs_sysctl_defense_mode},
         {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
          &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
          &ip_vs_sysctl_defense_mode},
         {NET_IPV4_VS_TO_ES, "timeout_established",
          &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_SS, "timeout_synsent",
          &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_SR, "timeout_synrecv",
          &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_FW, "timeout_finwait",
          &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_TW, "timeout_timewait",
          &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_CL, "timeout_close",
          &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_CW, "timeout_closewait",
          &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_LA, "timeout_lastack",
          &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_LI, "timeout_listen",
          &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_SA, "timeout_synack",
          &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_UDP, "timeout_udp",
          &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
          &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
          sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
         {0}},
        /* vs_dir: "vs" directory holding vs_vars */
        {{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars},
         {0}},
        /* ipv4_dir: "ipv4" directory holding vs_dir */
        {{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir},
         {0}},
        /* root_dir: "net" directory holding ipv4_dir */
        {{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir},
         {0}}
};
        

/*
 *	Write the contents of the VS rule table to a PROCfs file.
 */
/*
 *	Map the forwarding method bits of a connection flags word to a
 *	short printable name.  Any method other than local node, tunnel
 *	or direct route is reported as "Masq".
 */
static inline char *ip_vs_fwd_name(unsigned flags)
{
	unsigned method = flags & IP_VS_CONN_F_FWD_MASK;

	switch (method) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		break;
	}
	return "Masq";
}

/*
 *	Produce the textual dump of the virtual service table (registered
 *	as the "ip_vs" proc_net entry by ip_vs_control_init()).
 *
 *	buf	output buffer the text is formatted into
 *	start	out: set to the position in buf that corresponds to
 *		file position 'offset'
 *	offset	file position the reader wants to start at
 *	length	maximum number of bytes the reader wants
 *
 *	Returns the number of bytes available at *start, clamped to the
 *	range [0, length].
 *
 *	The whole listing is regenerated on every call.  'pos' counts all
 *	bytes generated so far (i.e. the current file position), while
 *	'len' counts the bytes currently held in buf.  Whenever everything
 *	generated so far lies before 'offset', len is reset to 0 so the
 *	same buffer space is reused.
 *
 *	NOTE(review): all output goes through unbounded sprintf(); the
 *	size of buf is not visible here - confirm the caller's buffer can
 *	absorb one record of overshoot past offset+length.
 */
static int ip_vs_get_info(char *buf, char **start, off_t offset, int length)
{
        int idx;
        int len=0;      /* bytes currently stored in buf */
        off_t pos=0;    /* total bytes generated so far */
        int size;       /* size of the record just formatted */
        struct ip_vs_service *svc;
        struct ip_vs_dest *dest;
        struct list_head *l, *e, *p, *q;

        /* banner and column headings */
        size = sprintf(buf+len,
                       "IP Virtual Server version %d.%d.%d (size=%d)\n"
                       "Prot LocalAddress:Port Scheduler Flags\n"
                       "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n",
                       NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
        pos += size;
        len += size;
        
        /* hold the service table read lock while walking both tables */
        read_lock_bh(&__ip_vs_svc_lock);

        /* print the service table hashed by <protocol,addr,port> */
        for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
                l = &ip_vs_svc_table[idx];
                for (e=l->next; e!=l; e=e->next) {
                        svc = list_entry(e, struct ip_vs_service, s_list);
                        /* first half of the service line (no newline yet) */
                        size = sprintf(buf+len, "%s  %08X:%04X %s",
                                       vs_proto_name(svc->protocol),
                                       ntohl(svc->addr),
                                       ntohs(svc->port),
                                       svc->scheduler->name);
                        len += size;
                        pos += size;
                        
                        /* finish the line, adding persistence info if set */
                        if (svc->flags & IP_VS_SVC_F_PERSISTENT)
                                size = sprintf(buf+len,
                                               " persistent %d %08X\n",
                                               svc->timeout,
                                               ntohl(svc->netmask));
                        else
                                size = sprintf(buf+len, "\n");
                        
                        len += size;
                        pos += size;

                        /* all output so far precedes the window: discard it */
                        if (pos <= offset)
                                len=0;
                        /* the requested window is completely filled */
                        if (pos >= offset+length)
                                goto done;
			       
                        /* one line per destination of this service */
                        p = &svc->destinations;
                        for (q=p->next; q!=p; q=q->next) {
                                dest = list_entry(q,struct ip_vs_dest,n_list);
                                size = sprintf(buf+len,
                                               "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
                                               ntohl(dest->addr),
                                               ntohs(dest->port),
                                               ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
                                               atomic_read(&dest->weight),
                                               atomic_read(&dest->activeconns),
                                               atomic_read(&dest->inactconns));
                                len += size;
                                pos += size;
                                
                                /* same window bookkeeping as for service lines */
                                if (pos <= offset)
                                        len=0;
                                if (pos >= offset+length)
                                        goto done;
                        }
                }
        }

        /* print the service table hashed by fwmark */
        for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
                l = &ip_vs_svc_fwm_table[idx];
                for (e=l->next; e!=l; e=e->next) {
                        svc = list_entry(e, struct ip_vs_service, f_list);
                        /* fwmark services show "FWM" and the mark instead of proto/addr:port */
                        size = sprintf(buf+len, "%s  %08X %s", "FWM",
                                       svc->fwmark, svc->scheduler->name);
                        len += size;
                        pos += size;
                        
                        if (svc->flags & IP_VS_SVC_F_PERSISTENT)
                                size = sprintf(buf+len,
                                               " persistent %d %08X\n",
                                               svc->timeout,
                                               ntohl(svc->netmask));
                        else
                                size = sprintf(buf+len, "\n");
                        
                        len += size;
                        pos += size;

                        if (pos <= offset)
                                len=0;
                        if (pos >= offset+length)
                                goto done;
			       
                        /* destinations are printed exactly as in the first table */
                        p = &svc->destinations;
                        for (q=p->next; q!=p; q=q->next) {
                                dest = list_entry(q,struct ip_vs_dest,n_list);
                                size = sprintf(buf+len,
                                               "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
                                               ntohl(dest->addr),
                                               ntohs(dest->port),
                                               ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
                                               atomic_read(&dest->weight),
                                               atomic_read(&dest->activeconns),
                                               atomic_read(&dest->inactconns));
                                len += size;
                                pos += size;
                                
                                if (pos <= offset)
                                        len=0;
                                if (pos >= offset+length)
                                        goto done;
                        }
                }
        }

  done:
        read_unlock_bh(&__ip_vs_svc_lock);
        
        /*
         * buf+len is the end of the buffered data and corresponds to file
         * position pos, so stepping back (pos-offset) bytes lands on the
         * byte for file position offset.
         */
        *start = buf+len-(pos-offset);          /* Start of wanted data */
        len = pos-offset;
        /* never report more than was asked for ... */
        if (len > length)
                len = length;
        /* ... and report nothing when offset is at or past the end */
        if (len < 0)
                len = 0;
        
        return len;
}


/*
 *	Handle an IPVS setsockopt() request.
 *
 *	sk	socket the request was issued on (not used here)
 *	cmd	one of the IP_VS_SO_SET_* commands
 *	user	user-space pointer to a struct ip_vs_rule_user
 *	len	number of bytes available at 'user'
 *
 *	Returns the result of the dispatched flush/service/destination
 *	helper, or a negative errno:
 *	  -EPERM   caller lacks CAP_NET_ADMIN
 *	  -EINVAL  bad length or unknown command
 *	  -ENOMEM  no memory for the kernel copy of the rule
 *	  -EFAULT  copy from user space failed, or protocol is neither
 *	           TCP nor UDP
 *	  -EEXIST  adding a service that already exists
 *	  -ESRCH   editing/deleting a service that does not exist
 *
 *	The whole operation after the copy runs under __ip_vs_mutex.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
{
	int ret;
	struct ip_vs_rule_user *urule;
	struct ip_vs_service *svc = NULL;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/*
	 * The request must hold at least one complete rule.
	 * len > 128000 is a sanity check.
	 */
	if (len < sizeof(struct ip_vs_rule_user) || len > 128000) {
		/* report both bounds: either of them may have been violated */
		IP_VS_ERR("IP_VS_SO_INSERT/DELETE: bad len %u (min %u, max %u)\n",
			  len, (unsigned int)sizeof(struct ip_vs_rule_user),
			  128000U);
		return -EINVAL;
	} else if ((urule = kmalloc(len, GFP_KERNEL)) == NULL) {
		IP_VS_ERR("IP_VS_SO_INSERT/DELETE: oom for len %u\n", len);
		return -ENOMEM;
	} else if (copy_from_user(urule, user, len) != 0) {
		ret = -EFAULT;
		goto out_free;
	}

	MOD_INC_USE_COUNT;

	spin_lock(&__ip_vs_mutex);

	/*
	 * Flush all the virtual service...
	 */
	if (cmd == IP_VS_SO_SET_FLUSH) {
		ret = ip_vs_flush();
		goto out_unlock;
	}
	
	/*
	 * Check for valid protocol: TCP or UDP. Even
	 * for fwmark!=0
	 */
	if (urule->protocol!=IPPROTO_TCP && urule->protocol!=IPPROTO_UDP) {
		/*
		 * protocol is compared in host order above, so it is
		 * printed as-is; only the port is byte-swapped.
		 * NOTE(review): sched_name comes from user space; confirm
		 * it is guaranteed to be NUL-terminated before it is
		 * printed with %s.
		 */
		IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
			   urule->protocol, NIPQUAD(urule->vaddr),
			   ntohs(urule->vport), urule->sched_name);
		ret = -EFAULT;
		goto out_unlock;
	}

	/*
	 * Lookup the exact service by (protocol, vaddr, vport),
	 * or by fwmark when one is given.
	 */
	if (urule->vfwmark == 0)
		svc = __ip_vs_lookup_service(urule->protocol,
					     urule->vaddr, urule->vport);
	else
		svc = __ip_vs_lookup_svc_fwm(urule->vfwmark);

	/*
	 * Presumably the lookup helpers take a reference on the service
	 * they return (TODO confirm); give it back right away.
	 */
	if (svc)
                atomic_dec(&svc->refcnt);

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(urule);
		break;
	case IP_VS_SO_SET_EDIT:
		if (svc == NULL || svc->protocol != urule->protocol)
			ret = -ESRCH;
		else
			ret = ip_vs_edit_service(svc, urule);
		break;
	case IP_VS_SO_SET_DEL:
		if (svc == NULL || svc->protocol != urule->protocol)
			ret =  -ESRCH;
		else
			ret = ip_vs_del_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		if (svc == NULL || svc->protocol != urule->protocol)
			ret = -ESRCH;
		else
			ret = ip_vs_add_dest(svc, urule);
		break;
	case IP_VS_SO_SET_EDITDEST:
		if (svc == NULL || svc->protocol != urule->protocol)
			ret = -ESRCH;
		else
			ret = ip_vs_edit_dest(svc, urule);
		break;
	case IP_VS_SO_SET_DELDEST:
		if (svc == NULL || svc->protocol != urule->protocol)
			ret = -ESRCH;
		else
			ret = ip_vs_del_dest(svc, urule);
		break;
	default:
		ret = -EINVAL;
		break;
	}

  out_unlock:
	spin_unlock(&__ip_vs_mutex);
	MOD_DEC_USE_COUNT;
  out_free:
	kfree(urule);
	return ret;
}


/*
 *	Handle an IPVS getsockopt() request.  Only IP_VS_SO_GET_VERSION
 *	is supported: it returns the string "IPVS <major>.<minor>.<patch>"
 *	including its terminating NUL.
 *
 *	sk	socket the request was issued on (not used here)
 *	cmd	requested option; must be IP_VS_SO_GET_VERSION
 *	user	user-space buffer that receives the string
 *	len	in: size of the user buffer; out: bytes written
 *
 *	Returns 0 on success, -EINVAL for an unknown command or a user
 *	buffer that is too small (or negative), -EFAULT if the copy to
 *	user space fails.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void *user, int *len)
{
        char buf[32];
        int size;

	if (cmd != IP_VS_SO_GET_VERSION)
                return -EINVAL;

        /*
         * sprintf() returns the string length without the NUL; copy
         * exactly the string plus its terminator and never read past
         * the end of buf.
         */
        size = sprintf(buf, "IPVS %d.%d.%d", NVERSION(IP_VS_VERSION_CODE)) + 1;

        /* signed comparison, so a negative *len is rejected as well */
	if (*len < size)
		return -EINVAL;

	if (copy_to_user(user, buf, size) != 0)
		return -EFAULT;

	*len = size;
	return 0;
}


/*
 *	Socket option registration record for IPVS, passed to
 *	nf_register_sockopt()/nf_unregister_sockopt().  It is bound to
 *	PF_INET and routes set requests to do_ip_vs_set_ctl() and get
 *	requests to do_ip_vs_get_ctl().
 *
 *	NOTE(review): the initializer is positional; the two numeric pairs
 *	are presumably the [first, last+1) option number ranges for set and
 *	get - confirm against the definition of struct nf_sockopt_ops.
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
        { NULL, NULL }, PF_INET,
        IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl,
        IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl
};


/*
 *	Set up the IPVS control interface: initialise the service and
 *	real-server hash tables, then register the socket options, the
 *	"ip_vs" proc_net entry and the sysctl table.
 *
 *	Returns 0 on success, or the error from nf_register_sockopt().
 *	The results of proc_net_create() and register_sysctl_table() are
 *	not checked; the sysctl header is only stored for later removal.
 */
int ip_vs_control_init(void)
{
        int ret;
        int idx;

        EnterFunction(1);

        /*
         * Initialize ip_vs_svc_table, ip_vs_svc_fwm_table and
         * ip_vs_rtable.  This must happen before anything is registered:
         * the sockopt and proc handlers walk these lists and can be
         * invoked as soon as they are visible to user space.
         */
        for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
                INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
                INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
        }
        for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
                INIT_LIST_HEAD(&ip_vs_rtable[idx]);
        }

 	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
                IP_VS_ERR("cannot register sockopt.\n");
                return ret;
        }

        proc_net_create("ip_vs", 0, ip_vs_get_info);

        ipv4_vs_table.sysctl_header =
                register_sysctl_table(ipv4_vs_table.root_dir, 0);

        LeaveFunction(1);
        return 0;
}

/*
 *	Tear down the IPVS control interface.  The three registrations
 *	made by ip_vs_control_init() are undone in reverse order: sysctl
 *	table, "ip_vs" proc_net entry, then the socket options.
 */
void ip_vs_control_cleanup(void)
{
        EnterFunction(1);
        /* header was saved by ip_vs_control_init() */
        unregister_sysctl_table(ipv4_vs_table.sysctl_header);
        proc_net_remove("ip_vs");
        nf_unregister_sockopt(&ip_vs_sockopts);
        LeaveFunction(1);
}
