/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_conn.c,v 1.5 2000/09/06 08:36:36 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <uli@linux.tu-varna.acad.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
 *
 * Changes:
 *
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <linux/in.h>
#include <linux/proc_fs.h>              /* for proc_net_* */
#include <asm/softirq.h>                /* for local_bh_* */

#include "ip_vs.h"
//#include <net/ip_vs.h>


/*
 *  Connection hash table: for input and output packets lookups of IPVS.
 *  Array of IP_VS_CONN_TAB_SIZE list heads, allocated by ip_vs_conn_init()
 *  and released by ip_vs_conn_cleanup().
 */
struct list_head *ip_vs_table;

/*  SLAB cache for IPVS connections (struct ip_vs_conn objects) */
static kmem_cache_t *ip_vs_conn_cachep;

/*
 *  No client port connection counter: number of live connections that were
 *  created with IP_VS_CONN_F_NO_CPORT set. ip_vs_conn_in_get() only retries
 *  a lookup with client port 0 while this counter is non-zero.
 */
atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);


/*
 *	Arm or disarm the expiration timer of a connection.
 *	tout != 0: the timer is set to fire tout jiffies from now and added.
 *	tout == 0: the timer is removed (expiration cancelled).
 *	Warning: a previously armed timer is neither checked nor deleted
 *	before a new one is added.
 */
static inline void
__ip_vs_set_expire(struct ip_vs_conn *cp, unsigned long tout)
{
	if (!tout) {
		del_sltimer(&cp->timer);
		return;
	}

	cp->timer.expires = jiffies + tout;
	add_sltimer(&cp->timer);
}


/*
 *  Fine locking granularity for big connection hash table:
 *  instead of one lock for the whole table, a small array of rwlocks is
 *  used and a bucket selects its lock with (hash & CT_LOCKARRAY_MASK).
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

/* one rwlock per array slot, aligned to SMP_CACHE_BYTES */
struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table (initialized in ip_vs_conn_init()) */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

static inline void ct_read_lock(unsigned key) 
{
        read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned key) 
{
        read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned key) 
{
        write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned key) 
{
        write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned key) 
{
        read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned key) 
{
        read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *	Compute the ip_vs_table bucket index for a connection.
 *	addr and port are converted with ntohl()/ntohs() before mixing.
 *	The protocol, the address, the address shifted right by
 *	IP_VS_CONN_TAB_BITS and the port are XORed together and the result
 *	is masked with IP_VS_CONN_TAB_MASK.
 */
static inline unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	unsigned host_addr = ntohl(addr);
	unsigned key = proto;

	key ^= host_addr;
	key ^= host_addr >> IP_VS_CONN_TAB_BITS;
	key ^= ntohs(port);

	return key & IP_VS_CONN_TAB_MASK;
}


/*
 *	Insert a connection into ip_vs_table.
 *	The bucket is chosen from the protocol and the client address/port.
 *	On success the entry is flagged IP_VS_CONN_F_HASHED and its refcnt
 *	is raised by one (the table's reference).
 *	Returns 1 on success, 0 (after logging an error) if the entry is
 *	already flagged as hashed.
 */
int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned bucket;

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		bucket = ip_vs_conn_hashkey(cp->protocol,
					    cp->caddr, cp->cport);

		/* list, flag and refcnt are updated under the write lock */
		ct_write_lock(bucket);
		list_add(&cp->c_list, &ip_vs_table[bucket]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		ct_write_unlock(bucket);

		return 1;
	}

	IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
		  "called from %p\n", __builtin_return_address(0));
	return 0;
}


/*
 *	Remove a connection from ip_vs_table.
 *	The bucket write lock is taken here, around the list removal.
 *	On success IP_VS_CONN_F_HASHED is cleared and the table's reference
 *	on the entry is dropped.
 *	Returns 1 on success, 0 (after logging an error) if the entry is not
 *	flagged as hashed.
 */
int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned bucket;

	if ((cp->flags & IP_VS_CONN_F_HASHED) == 0) {
		IP_VS_ERR("ip_vs_conn_unhash(): request for unhash flagged, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/* same key as used by ip_vs_conn_hash() */
	bucket = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);

	ct_write_lock(bucket);
	list_del(&cp->c_list);
	cp->flags &= ~IP_VS_CONN_F_HASHED;
	atomic_dec(&cp->refcnt);
	ct_write_unlock(bucket);

	return 1;
}


/*
 *  Look up the connection matching a packet coming from OUTside-to-INside.
 *	s_addr, s_port: pkt source address (foreign host)
 *	d_addr, d_port: pkt dest address (load balancer)
 *  The bucket read lock is taken and released inside this function.
 *  On a hit the entry's refcnt is incremented and its expiration timer is
 *  cancelled before it is returned; NULL is returned on a miss.
 */
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned bucket = ip_vs_conn_hashkey(protocol, s_addr, s_port);
	struct ip_vs_conn *found = NULL;
	struct list_head *head, *pos;

	ct_read_lock(bucket);

	head = &ip_vs_table[bucket];
	for (pos = head->next; pos != head; pos = pos->next) {
		struct ip_vs_conn *cand =
			list_entry(pos, struct ip_vs_conn, c_list);

		if (s_addr != cand->caddr || s_port != cand->cport)
			continue;
		if (d_port != cand->vport || d_addr != cand->vaddr)
			continue;
		if (protocol != cand->protocol)
			continue;

		/* HIT */
		atomic_inc(&cand->refcnt);
		__ip_vs_set_expire(cand, 0);
		found = cand;
		break;
	}

	ct_read_unlock(bucket);

	return found;
}

/*
 *  Input-direction lookup with fallback.
 *  First the exact client address/port is tried; if that misses and at
 *  least one no-client-port connection exists (ip_vs_conn_no_cport_cnt),
 *  the lookup is repeated with source port 0.
 *  Returns the referenced connection or NULL.
 */
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
	if (cp == NULL && atomic_read(&ip_vs_conn_no_cport_cnt) != 0)
		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);

	IP_VS_DBG(2, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  cp?"hit":"not hit");

	return cp;
}


/*
 *  Look up the connection matching a packet coming from inside-to-OUTside.
 *	s_addr, s_port: pkt source address (inside host)
 *	d_addr, d_port: pkt dest address (foreign host)
 *  The bucket is derived from the packet's destination, which is the
 *  client side of the connection. The bucket read lock is taken and
 *  released inside this function.
 *  On a hit the entry's refcnt is incremented and its expiration timer is
 *  cancelled; NULL is returned on a miss.
 */
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
	unsigned bucket = ip_vs_conn_hashkey(protocol, d_addr, d_port);
	struct ip_vs_conn *found = NULL;
	struct list_head *head, *pos;

	ct_read_lock(bucket);

	/* only "full" addressed entries are matched here */
	head = &ip_vs_table[bucket];
	for (pos = head->next; pos != head; pos = pos->next) {
		struct ip_vs_conn *cand =
			list_entry(pos, struct ip_vs_conn, c_list);

		if (d_addr != cand->caddr || d_port != cand->cport)
			continue;
		if (s_port != cand->dport || s_addr != cand->daddr)
			continue;
		if (protocol != cand->protocol)
			continue;

		/* HIT */
		atomic_inc(&cand->refcnt);
		__ip_vs_set_expire(cand, 0);
		found = cand;
		break;
	}

	ct_read_unlock(bucket);

	IP_VS_DBG(2, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
		  vs_proto_name(protocol),
		  NIPQUAD(s_addr), ntohs(s_port),
		  NIPQUAD(d_addr), ntohs(d_port),
		  found?"hit":"not hit");

	return found;
}


/*
 *	Release a reference obtained from one of the lookup functions.
 *	After the reference is dropped, the expiration timer is re-armed with
 *	cp->timeout if the count reads exactly 1 (only the connection table
 *	still refers to the entry); otherwise a debug message is emitted and
 *	no timer is set.
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* decrement refcnt */
	__ip_vs_conn_put(cp);

	if (atomic_read(&cp->refcnt) != 1) {
		IP_VS_DBG(0, "did not set timer with refcnt=%d, "
			  "called from %p\n",
			  atomic_read(&cp->refcnt),
			  __builtin_return_address(0));
		return;
	}

	__ip_vs_set_expire(cp, cp->timeout);
}


/*
 *	Timeout table[state]
 *
 *	Default per-state connection timeouts, expressed in jiffies
 *	(multiples of HZ). vs_set_state_timeout() copies the entry for the
 *	new state into cp->timeout and then shifts it by 'scale'.
 */
static struct ip_vs_timeout_table vs_timeout_table = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	30*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	15*60*HZ,
		[IP_VS_S_SYN_SENT]	=	2*60*HZ,
		[IP_VS_S_SYN_RECV]	=	1*60*HZ,
		[IP_VS_S_FIN_WAIT]	=	2*60*HZ,
		[IP_VS_S_TIME_WAIT]	=	2*60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	120*HZ,
		[IP_VS_S_UDP]		=	5*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};


/*
 *	Timeout table selected by ip_vs_secure_tcp_set(on != 0).
 *	Same layout as vs_timeout_table; most entries are shorter
 *	(e.g. SYN_RECV is 10*HZ instead of 1*60*HZ).
 */
struct ip_vs_timeout_table vs_timeout_table_dos = {
	ATOMIC_INIT(0),	/* refcnt */
	0,		/* scale  */
	{
		[IP_VS_S_NONE]          =	15*60*HZ,
		[IP_VS_S_ESTABLISHED]	=	8*60*HZ,
		[IP_VS_S_SYN_SENT]	=	60*HZ,
		[IP_VS_S_SYN_RECV]	=	10*HZ,
		[IP_VS_S_FIN_WAIT]	=	60*HZ,
		[IP_VS_S_TIME_WAIT]	=	60*HZ,
		[IP_VS_S_CLOSE]         =	10*HZ,
		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
		[IP_VS_S_LAST_ACK]	=	30*HZ,
		[IP_VS_S_LISTEN]	=	2*60*HZ,
		[IP_VS_S_SYNACK]	=	100*HZ,
		[IP_VS_S_UDP]		=	3*60*HZ,
		[IP_VS_S_ICMP]          =	1*60*HZ,
		[IP_VS_S_LAST]          =	2*HZ,
	},	/* timeout */
};

/*
 *	Timeout table attached to newly created VS entries
 *	(see ip_vs_conn_new()). It starts as the default table
 *	(vs_timeout_table); ip_vs_secure_tcp_set() switches it to
 *	vs_timeout_table_dos, e.g. under flood attack.
 *	A connection whose own timeout_table pointer is NULL falls back to
 *	vs_timeout_table in vs_set_state_timeout().
 */

static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;


/*
 *	Printable name for each connection state, indexed by IP_VS_S_*.
 *	Read through ip_vs_state_name(), which never returns the
 *	[IP_VS_S_LAST] entry because it rejects state >= IP_VS_S_LAST.
 */
static const char * state_name_table[IP_VS_S_LAST+1] = {
	[IP_VS_S_NONE]          =	"NONE",
	[IP_VS_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_S_SYN_SENT]	=	"SYN_SENT",
	[IP_VS_S_SYN_RECV]	=	"SYN_RECV",
	[IP_VS_S_FIN_WAIT]	=	"FIN_WAIT",
	[IP_VS_S_TIME_WAIT]	=	"TIME_WAIT",
	[IP_VS_S_CLOSE]         =	"CLOSE",
	[IP_VS_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_S_LAST_ACK]	=	"LAST_ACK",
	[IP_VS_S_LISTEN]	=	"LISTEN",
	[IP_VS_S_SYNACK]	=	"SYNACK",
	[IP_VS_S_UDP]		=	"UDP",
	[IP_VS_S_ICMP]          =	"ICMP",
	[IP_VS_S_LAST]          =	"BUG!",
};

/* Two-letter abbreviations of the TCP states, used in the tables below */
#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK

/*
 * One row of a TCP state transition table:
 * next_state[current state] gives the state to move to.
 */
struct vs_tcp_states_t {
	int next_state[IP_VS_S_LAST];	/* should be _LAST_TCP */
};

/*
 *	Return a printable name for a connection state.
 *	Any value outside [0, IP_VS_S_LAST) yields "ERR!"; negative values
 *	are rejected too so that state_name_table is never indexed before
 *	its start. A valid state without a table entry yields "?".
 */
const char * ip_vs_state_name(int state)
{
	if (state < 0 || state >= IP_VS_S_LAST)
		return "ERR!";
	return state_name_table[state] ? state_name_table[state] : "?";
}

/*
 *	Default TCP state transition table.
 *	Row index: value returned by vs_tcp_state_idx(), i.e. a direction
 *	offset ([0-3] input, [4-7] output, [8-11] input-only) plus
 *	0=syn, 1=fin, 2=ack, 3=rst.
 *	Column index: the connection's current state.
 */
static struct vs_tcp_states_t vs_tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/*
 *	TCP state transition table selected by ip_vs_secure_tcp_set(on != 0).
 *	Same indexing as vs_tcp_states (row: direction offset + syn/fin/ack/rst,
 *	column: current state); it differs from the default table mainly in
 *	the transitions involving sSA (SYNACK).
 */
static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/* TCP transition table currently in effect; switched by ip_vs_secure_tcp_set() */
static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;

/*
 *	Select the TCP state table and the timeout table as a pair.
 *	on != 0: the *_dos variants; on == 0: the default tables.
 */
void ip_vs_secure_tcp_set(int on)
{
	ip_vs_state_table = on ? vs_tcp_states_dos : vs_tcp_states;
	ip_vs_timeout_table = on ? &vs_timeout_table_dos : &vs_timeout_table;
}


/*
 *	Map the TCP flags of a segment to a row of the state tables.
 *	Rows are grouped as [0-3] input, [4-7] output, [8-11] input-only;
 *	state_off selects the group and the flag selects the row inside it:
 *	syn=0, fin=1, ack=2, rst=3. The flags are tested in the order
 *	rst, syn, fin, ack. Returns -1 if none of them is set.
 */
static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
{
	int row;

	if (th->rst)
		row = 3;
	else if (th->syn)
		row = 0;
	else if (th->fin)
		row = 1;
	else if (th->ack)
		row = 2;
	else
		return -1;

	return state_off + row;
}


/*
 *	Put the connection into 'state' and load the matching timeout.
 *	The timeout comes from the connection's own table, or from
 *	vs_timeout_table when none is attached, and is then scaled:
 *	shifted left by 'scale' if positive, right by -scale if negative.
 *	Returns the state that was set.
 */
static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
{
	struct ip_vs_timeout_table *tbl;
	int shift;

	tbl = cp->timeout_table ? cp->timeout_table : &vs_timeout_table;

	cp->state = state;
	cp->timeout = tbl->timeout[cp->state];

	shift = tbl->scale;
	if (shift > 0)
		cp->timeout <<= shift;
	else if (shift < 0)
		cp->timeout >>= -shift;

	return state;
}


/*
 *	Advance the TCP state of a connection for one segment.
 *	cp:        connection being updated
 *	state_off: direction offset into the state table (VS_STATE_*)
 *	th:        TCP header of the segment
 *	The next state is looked up in ip_vs_state_table; if the segment
 *	carries none of rst/syn/fin/ack the state defaults to IP_VS_S_CLOSE.
 *	On a state change the destination's active/inactive connection
 *	counters are moved accordingly. Returns the new state.
 */
static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_S_CLOSE;

	/*
         *    Update state offset to INPUT_ONLY if necessary
         *    or delete NO_OUTPUT flag if output packet detected
         */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == VS_STATE_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
                        state_off = VS_STATE_INPUT_ONLY;
	} 

	/* no usable flag: keep the IP_VS_S_CLOSE default */
	if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
		IP_VS_DBG(1, "vs_tcp_state_idx(%d)=%d!!!\n",
                          state_off, state_idx);
		goto tcp_state_out;
	}

	new_state = ip_vs_state_table[state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
                struct ip_vs_dest *dest = cp->dest;

                IP_VS_DBG(1, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
                          "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
                          vs_proto_name(cp->protocol),
                          (state_off==VS_STATE_OUTPUT)?"output ":"input ",
                          th->syn? 'S' : '.',
                          th->fin? 'F' : '.',
                          th->ack? 'A' : '.',
                          th->rst? 'R' : '.',
                          NIPQUAD(cp->daddr), ntohs(cp->dport),
                          NIPQUAD(cp->caddr), ntohs(cp->cport),
                          ip_vs_state_name(cp->state),
                          ip_vs_state_name(new_state),
                          atomic_read(&cp->refcnt));
                /*
                 * Only ESTABLISHED connections count as active:
                 * leaving it moves the connection to inactconns,
                 * entering it moves it back to activeconns.
                 */
                if (dest) {
                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
                            (new_state != IP_VS_S_ESTABLISHED)) {
                                atomic_dec(&dest->activeconns);
                                atomic_inc(&dest->inactconns);
                                cp->flags |= IP_VS_CONN_F_INACTIVE;
                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
                                   (new_state == IP_VS_S_ESTABLISHED)) {
                                atomic_inc(&dest->activeconns);
                                atomic_dec(&dest->inactconns);
                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
                        }
                }
	}

	/* also stores new_state in cp->state */
	return vs_set_state_timeout(cp, new_state);
}


/*
 *	Handle state transitions for one packet.
 *	TCP goes through the TCP state machine (tp is passed on as the TCP
 *	header); UDP and ICMP are put into their single fixed state.
 *	Returns the resulting state, or -1 for any other protocol, in which
 *	case the connection is left untouched.
 */
int vs_set_state(struct ip_vs_conn *cp,
                 int state_off, struct iphdr *iph, void *tp)
{
	if (iph->protocol == IPPROTO_TCP)
		return vs_tcp_state(cp, state_off, tp);

	if (iph->protocol == IPPROTO_UDP)
		return vs_set_state_timeout(cp, IP_VS_S_UDP);

	if (iph->protocol == IPPROTO_ICMP)
		return vs_set_state_timeout(cp, IP_VS_S_ICMP);

	return -1;
}


/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 *	Puts the connection into IP_VS_S_LISTEN, which also loads the
 *	matching timeout, and returns that timeout. No timer is armed here.
 */
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
	vs_set_state_timeout(cp, IP_VS_S_LISTEN);
	return cp->timeout;
}


/*
 *  Attach a connection entry to a virtual service destination.
 *  Called when a new connection entry is created for VS.
 *  The destination's conn_flags are ORed into the connection flags, the
 *  connection keeps a pointer to the destination and the destination's
 *  refcnt is incremented for that pointer.
 */
void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	/* the connection now holds a reference on the dest */
	atomic_inc(&dest->refcnt);

	cp->dest = dest;
	cp->flags |= atomic_read(&dest->conn_flags);

	IP_VS_DBG(1, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
		  vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));
}


/*
 *  Unbind a connection entry from its VS destination.
 *  Called by the connection expire function.
 *  Does nothing if the connection has no destination. Otherwise the
 *  destination's active/inactive counter is adjusted (unless the entry is
 *  a connection template, cp->cport == 0), the destination's refcnt is
 *  dropped and the destination is freed when that was the last reference.
 */
void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	/*
	 * Check before the debug output below: it reads dest->refcnt and
	 * would dereference a NULL pointer for an unbound connection.
	 */
	if (!dest)
		return;

	IP_VS_DBG(1, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
		  vs_proto_name(cp->protocol),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  cp->flags, atomic_read(&cp->refcnt),
		  atomic_read(&dest->refcnt));

	/*
	 * Decrease the inactconns or activeconns counter
	 * if it is not a connection template (cp->cport!=0).
	 */
	if (cp->cport) {
		if (cp->flags & IP_VS_CONN_F_INACTIVE)
			atomic_dec(&dest->inactconns);
		else
			atomic_dec(&dest->activeconns);
	}

	/*
	 * Decrease the refcnt of the dest, and free the dest
	 * if nobody refers to it (refcnt=0).
	 */
	if (atomic_dec_and_test(&dest->refcnt))
		kfree(dest);
}


/*
 *  Check whether the destination of a connection template is usable.
 *  Returns 1 if the template has a destination flagged
 *  IP_VS_DEST_F_AVAILABLE. Otherwise the template is invalidated: it is
 *  unhashed, its dport/vport are set to 65535 and its cport to 0, it is
 *  hashed again under the new key, its refcnt is decremented without
 *  restarting its timer, and 0 is returned.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;

	/* dest server is there and available: nothing to do */
	if (dest != NULL && (dest->flags & IP_VS_DEST_F_AVAILABLE))
		return 1;

	IP_VS_DBG(1, "check_template: dest not available for "
		  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
		  "-> d:%u.%u.%u.%u:%d\n",
		  vs_proto_name(ct->protocol),
		  NIPQUAD(ct->caddr), ntohs(ct->cport),
		  NIPQUAD(ct->vaddr), ntohs(ct->vport),
		  NIPQUAD(ct->daddr), ntohs(ct->dport));

	/*
	 * Invalidate the connection template. The ports are part of the
	 * hash key, so the entry is taken out of the table while they
	 * change.
	 */
	ip_vs_conn_unhash(ct);
	ct->dport = 65535;
	ct->vport = 65535;
	ct->cport = 0;
	ip_vs_conn_hash(ct);

	/*
	 * Simply decrease the refcnt of the template,
	 * don't restart its timer.
	 */
	atomic_dec(&ct->refcnt);

	return 0;
}


/*
 *	Attach a timeout table to a connection and count the new user in
 *	the table's refcnt.
 */
static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
	cp->timeout_table = vstim;
	atomic_inc(&vstim->refcnt);
}

/*
 *	Detach the timeout table from a connection, if one is attached,
 *	and drop the reference it held on the table.
 */
static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
{
	struct ip_vs_timeout_table *tbl = cp->timeout_table;

	if (tbl) {
		cp->timeout_table = NULL;
		atomic_dec(&tbl->refcnt);
	}
}


/*
 *	Expiration handler of a connection (installed as cp->timer.function
 *	by ip_vs_conn_new(); 'data' is the connection pointer).
 *
 *	The connection is released only if it controls no other connection
 *	and nobody else holds a reference once it is unhashed. Otherwise its
 *	timeout is set to the TIME_WAIT value and expiration is retried
 *	later through ip_vs_conn_put().
 *
 *	NOTE(review): when the unhash succeeds but other references remain,
 *	the entry reaches expire_later already unhashed and unbound from
 *	its dest/app -- confirm that the retry path is meant to cope with
 *	such an entry.
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

	/* timeout used if expiration has to be retried below */
	if (cp->timeout_table)
		cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
	else
		cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 * 	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	does anybody control me?
	 */
	if (cp->control)
		ip_vs_control_del(cp);

	if (ip_vs_conn_unhash(cp)) {
		ip_vs_unbind_dest(cp);
		ip_vs_unbind_app(cp);
	}

	/*
	 *	refcnt==1 implies I'm the only one referrer
	 */
	if (atomic_read(&cp->refcnt) == 1) {
		ip_vs_timeout_detach(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		kmem_cache_free(ip_vs_conn_cachep, cp);
		/*  sysctl_ip_always_defrag--; */
		MOD_DEC_USE_COUNT;
		/*
		 * Plain return instead of jumping to a label placed right
		 * before the closing brace, which is not valid C before C23.
		 */
		return;
	}

  expire_later:
	IP_VS_DBG(1, "delayed: refcnt-1=%d conn.n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	/* drops the reference taken above and may re-arm the timer */
	ip_vs_conn_put(cp);
}


/*
 *  Allocate a connection entry, fill it in and hash it into ip_vs_table.
 *	proto:                 protocol, stored in cp->protocol
 *	caddr/cport:           client address and port
 *	vaddr/vport:           virtual address and port
 *	daddr/dport:           destination address and port
 *	flags:                 initial IP_VS_CONN_F_* flags
 *  The entry is allocated with GFP_ATOMIC from the connection slab cache.
 *  It is returned with refcnt 1 held for the caller, plus the reference
 *  taken by the table when it is hashed. Returns NULL if the allocation
 *  fails.
 */
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
               __u32 daddr, __u16 dport, unsigned flags)
{
	struct ip_vs_conn *conn;

	conn = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (!conn) {
		if (net_ratelimit())
			IP_VS_ERR("ip_vs_conn_new: no memory available.\n");
		return NULL;
	}
	MOD_INC_USE_COUNT;

	memset(conn, 0, sizeof(*conn));
	INIT_LIST_HEAD(&conn->c_list);

	/* expiration timer: fires ip_vs_conn_expire() on this entry */
	init_timer(&conn->timer);
	conn->timer.data     = (unsigned long)conn;
	conn->timer.function = ip_vs_conn_expire;
	ip_vs_timeout_attach(conn, ip_vs_timeout_table);

	/* protocol and the client / virtual / destination endpoints */
	conn->protocol = proto;
	conn->caddr = caddr;
	conn->cport = cport;
	conn->vaddr = vaddr;
	conn->vport = vport;
	conn->daddr = daddr;
	conn->dport = dport;

	conn->flags = flags;
	conn->app_data = NULL;
	conn->control = NULL;

	atomic_set(&conn->n_control, 0);
	atomic_set(&conn->in_pkts, 0);

	/*
	 * Mark the entry as referenced by the current thread before it
	 * becomes visible in the table, so that another thread running
	 * ip_vs_random_dropentry cannot drop it.
	 */
	atomic_set(&conn->refcnt, 1);

	/*  Hash it in the ip_vs_table */
	ip_vs_conn_hash(conn);

	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	ip_vs_bind_app(conn);
	vs_set_state_timeout(conn, IP_VS_S_NONE);

	return conn;
}


/*
 *	/proc/net/ip_vs_conn entries
 *
 *	Formats a header line followed by one line per connection in the
 *	table and returns the slice of that text selected by offset/length.
 *	buffer: output buffer
 *	start:  set to the first byte inside buffer that belongs to the slice
 *	offset: position in the complete text at which the caller wants data
 *	length: maximum number of bytes wanted
 *	Returns the number of bytes available at *start (at most length).
 *
 *	NOTE(review): the sprintf() calls are not bounded by any buffer
 *	size; this presumably relies on the caller providing slack beyond
 *	'length' -- confirm against the proc_net_create() contract.
 */
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
	/* pos: position in the complete text; len: bytes kept in buffer */
	off_t pos=0, begin;
	struct ip_vs_conn *cp;
        int idx, size;
	int len=0;
	struct list_head *l, *e;

        size = sprintf(buffer, "Prc FromIP   FPrt ToIP     TPrt "
                       "DestIP   DPrt State       Expires\n");
        pos += size;
        len += size;

        for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
                /*
                 *	Lock is actually only need in next loop 
                 *	we are called from uspace: must stop bh.
                 */
                ct_read_lock_bh(idx);

                l = &ip_vs_table[idx];
                for (e=l->next; e!=l; e=e->next) {
                        cp = list_entry(e, struct ip_vs_conn, c_list);

                        /* last column: jiffies left until the timer fires */
                        size = sprintf(buffer+len,
                                       "%-3s %08X %04X %08X %04X "
                                       "%08X %04X %-11s %7lu\n",
                                       vs_proto_name(cp->protocol),
                                       ntohl(cp->caddr), ntohs(cp->cport),
                                       ntohl(cp->vaddr), ntohs(cp->vport),
                                       ntohl(cp->daddr), ntohs(cp->dport),
                                       ip_vs_state_name(cp->state),
                                       cp->timer.expires-jiffies);
                        len += size;
                        pos += size;
                        /* everything so far lies before offset: discard it */
                        if (pos <= offset)
                                len=0;
                        /* enough text to satisfy the request */
                        if (pos >= offset+length) {
                                ct_read_unlock_bh(idx);
                                goto done;
                        }
                }
                ct_read_unlock_bh(idx);
	}

  done:
	/* skip the part of the kept text that precedes offset */
	begin = len - (pos - offset);
	*start = buffer + begin;
	len -= begin;
	if(len>length)
		len = length;
	return len;
}


/*
 *      Decide whether a connection should be randomly dropped
 *      before running out of memory.
 *      An entry is never dropped if the expression
 *      timeout + jiffies - timer.expires is below 60*HZ, or if it has
 *      seen more than 8 input packets. Otherwise a per-packet-count
 *      counter selects one entry out of every todrop_rate[in_pkts]
 *      candidates.
 *      Returns 1 to drop the entry, 0 to keep it.
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	/*
	 * The drop rate array needs tuning for real environments.
	 * Called from timer bh only => no locking
	 */
	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
	static char todrop_counter[9] = {0};
	int pkts;

	/* !!! TODO: cp->timeout locking */
	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
		return 0;

	pkts = atomic_read(&cp->in_pkts);
	if (pkts > 8 || !todrop_rate[pkts])
		return 0;

	/* not this candidate's turn yet */
	if (--todrop_counter[pkts] > 0)
		return 0;

	todrop_counter[pkts] = todrop_rate[pkts];
	return 1;
}


/*
 *	Randomly drop connection entries.
 *	IP_VS_CONN_TAB_SIZE/32 randomly chosen buckets are scanned.
 *	Connection templates (cport == 0) are skipped. Entries in SYN_RECV
 *	or SYNACK state are always dropped; ESTABLISHED and UDP entries are
 *	dropped when todrop_entry() says so. An entry is dropped by running
 *	its timer function; its controlling connection is dropped as well
 *	once nothing is controlled by it any more.
 */
void ip_vs_random_dropentry(void)
{
	int idx;
	struct ip_vs_conn *cp;
	struct list_head *l, *e, *next;
	struct ip_vs_conn *ct;
	void (*fn)(unsigned long);

	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
		unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;

		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock(hash);

		l = &ip_vs_table[hash];
		for (e=l->next; e!=l; e=next) {
			/*
			 * Fetch the successor while the entry is still
			 * valid: the timer function called below can
			 * free it, so e->next must not be read afterwards.
			 */
			next = e->next;

			cp = list_entry(e, struct ip_vs_conn, c_list);
			if (cp->cport == 0)
				/* connection template */
				continue;
			switch(cp->state) {
			case IP_VS_S_SYN_RECV:
			case IP_VS_S_SYNACK:
				break;

			case IP_VS_S_ESTABLISHED:
			case IP_VS_S_UDP:
				if (todrop_entry(cp))
					break;
				continue;

			default:
				continue;
			}

			/*
			 * The controlling entry may be dropped below as
			 * well; never resume the scan from it.
			 */
			ct = cp->control;
			if (ct && next == &ct->c_list)
				next = next->next;

			/*
			 * Drop the entry, and drop its ct if not referenced.
			 * The bucket lock is released first because the
			 * timer function takes it again when unhashing.
			 *
			 * NOTE(review): 'next' can still be unlinked by
			 * another CPU while the lock is dropped; a complete
			 * fix would pin the entries with a reference --
			 * confirm the contexts this can run concurrently with.
			 */
			ct_write_unlock(hash);
			IP_VS_DBG(1, "Drop connection\n");
			fn = (cp->timer).function;
			if (!del_sltimer(&cp->timer))
				fn((unsigned long)cp);
			if (ct && !atomic_read(&ct->n_control)) {
				IP_VS_DBG(1, "Drop connection template\n");
				del_sltimer(&ct->timer);
				fn((unsigned long)ct);
			}
			ct_write_lock(hash);
		}
		ct_write_unlock(hash);
	}
}
		

/*
 *	Set up the connection table.
 *	Allocates the hash table and the connection slab cache, initializes
 *	the bucket list heads and the lock array, and registers the
 *	"ip_vs_conn" /proc/net entry.
 *	Returns 0 on success, -ENOMEM if either allocation fails; in that
 *	case nothing stays allocated and ip_vs_table is NULL.
 */
int ip_vs_conn_init(void)
{
	int idx;

	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	ip_vs_table = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
	if (!ip_vs_table)
		return -ENOMEM;

	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = kmem_cache_create("ip_vs",
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ip_vs_conn_cachep) {
		vfree(ip_vs_table);
		/* do not leave a dangling pointer in the global */
		ip_vs_table = NULL;
		return -ENOMEM;
	}

	IP_VS_INFO("Connection hash table configured "
		   "(size=%d, memory=%ldKbytes)\n",
		   IP_VS_CONN_TAB_SIZE,
		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
	/* sizeof yields size_t; cast so the argument matches %d */
	IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
		  (int)sizeof(struct ip_vs_conn));

	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++)
		INIT_LIST_HEAD(&ip_vs_table[idx]);

	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)
		__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;

	proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);

	return 0;
}

/*
 *	Tear down what ip_vs_conn_init() set up: the connection slab cache,
 *	the "ip_vs_conn" /proc/net entry and the hash table memory.
 *	NOTE(review): the /proc entry is removed after the cache has been
 *	destroyed; confirm no reader can run in between.
 */
void ip_vs_conn_cleanup(void)
{
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
        proc_net_remove("ip_vs_conn");
        vfree(ip_vs_table);
}
