LVS is a layer-4 load balancer built on the netfilter framework. It consists of two parts: the user-space ipvsadm configuration tool and the in-kernel core module (ko).
The commonly used forwarding modes are DR, tunnel and DNAT. In DR and tunnel mode only the request traffic passes through the LVS director; the real server (rs) sends its reply directly back to the client. DNAT mode allows port mapping, so the reply must also go back through the director to be SNATed before it can reach the client.
LVS picks a real server with a scheduler; search the source for register_ip_vs_scheduler to see which schedulers are available.
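For example, in a kernel source tree the available schedulers can be listed by grepping for that registration call (path assumed to be the usual net/netfilter/ipvs directory):
grep -l register_ip_vs_scheduler net/netfilter/ipvs/ip_vs_*.c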
Module initialization
module_init(ip_vs_init);
static int __init ip_vs_init(void)
{
int ret;
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
goto exit;
}
ip_vs_protocol_init();
ret = ip_vs_conn_init();
if (ret < 0) {
pr_err("can't setup connection table.\n");
goto cleanup_protocol;
}
ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
if (ret < 0)
goto cleanup_conn;
ret = register_pernet_device(&ipvs_core_dev_ops);
if (ret < 0)
goto cleanup_sub;
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
goto cleanup_dev;
}
ret = ip_vs_register_nl_ioctl();
if (ret < 0) {
pr_err("can't register netlink/ioctl.\n");
goto cleanup_hooks;
}
pr_info("ipvs loaded.\n");
return ret;
cleanup_hooks:
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
cleanup_dev:
unregister_pernet_device(&ipvs_core_dev_ops);
cleanup_sub:
unregister_pernet_subsys(&ipvs_core_ops);
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_protocol:
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
exit:
return ret;
}
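In practice the core module is either auto-loaded by ipvsadm or loaded by hand; the pr_info above should then show up in the kernel log (the exact message prefix depends on the kernel's pr_fmt):
modprobe ip_vs
dmesg | grep -i ipvs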
1. ip_vs_control_init
/*
* Hash table: for virtual service lookups
*/
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
int __init ip_vs_control_init(void)
{
int idx;
int ret;
EnterFunction(2);
/* initialize the two hash tables; each has 256 buckets */
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now ? */
/* register a netdevice notifier: when a NIC goes down, ip_vs_dst_event is called to clean up the entries related to that device */
ret = register_netdevice_notifier(&ip_vs_dst_notifier);
if (ret < 0)
return ret;
LeaveFunction(2);
return 0;
}
2. ip_vs_protocol_init
This registers each protocol's ip_vs_protocol descriptor into ip_vs_proto_table.
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
.init = NULL,
.exit = NULL,
.init_netns = __ip_vs_tcp_init,
.exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
};
#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
if (pp->init != NULL)
pp->init(pp);
return 0;
}
int __init ip_vs_protocol_init(void)
{
char protocols[64];
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
strcat(protocols, ", "); \
strcat(protocols, (p)->name); \
} while (0)
protocols[0] = '\0';
protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_SCTP
REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
return 0;
}
3. ip_vs_conn_init
#define CONFIG_IP_VS_TAB_BITS 12
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
int __init ip_vs_conn_init(void)
{
int idx;
/* connection table size: 1 << 12 = 4096 buckets */
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
/*
* Allocate the connection hash table and initialize its list heads
*/
/* allocate the 4096 hash bucket heads of ip_vs_conn_tab */
ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
return -ENOMEM;
}
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
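The bucket count is fixed once the module is loaded. On kernels where ip_vs exposes conn_tab_bits as a module parameter, it can be raised for hosts with many concurrent connections, e.g. (value only an illustration):
modprobe ip_vs conn_tab_bits=20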
4. Register the pernet operations ipvs_core_ops and ipvs_core_dev_ops
For every net namespace the .init callback is called to do the per-namespace initialization.
ipvs_core_dev_ops only provides an .exit callback, which runs when the module is unloaded.
static struct pernet_operations ipvs_core_ops = {
.init = __ip_vs_init,
.exit = __ip_vs_cleanup,
.id = &ip_vs_net_id,
.size = sizeof(struct netns_ipvs),
};
static struct pernet_operations ipvs_core_dev_ops = {
.exit = __ip_vs_dev_cleanup,
};
__ip_vs_init mainly initializes the fields of the netns_ipvs structure:
static int __net_init __ip_vs_init(struct net *net)
{
struct netns_ipvs *ipvs;
ipvs = net_generic(net, ip_vs_net_id);
if (ipvs == NULL)
return -ENOMEM;
/* Hold the beast until a service is registerd */
ipvs->enable = 0;
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
/* initialize the estimator-related fields */
if (ip_vs_estimator_net_init(net) < 0)
goto estimator_fail;
if (ip_vs_control_net_init(net) < 0)
goto control_fail;
if (ip_vs_protocol_net_init(net) < 0)
goto protocol_fail;
if (ip_vs_app_net_init(net) < 0)
goto app_fail;
if (ip_vs_conn_net_init(net) < 0)
goto conn_fail;
if (ip_vs_sync_net_init(net) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
sizeof(struct netns_ipvs), ipvs->gen);
return 0;
/*
* Error handling
*/
sync_fail:
ip_vs_conn_net_cleanup(net);
conn_fail:
ip_vs_app_net_cleanup(net);
app_fail:
ip_vs_protocol_net_cleanup(net);
protocol_fail:
ip_vs_control_net_cleanup(net);
control_fail:
ip_vs_estimator_net_cleanup(net);
estimator_fail:
net->ipvs = NULL;
return -ENOMEM;
}
5. Register the hook functions with the netfilter framework
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
};
nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
6. ip_vs_register_nl_ioctl
This registers two channels for user/kernel communication: sockopt and generic netlink; ipvsadm can use either of them to push commands down.
int __init ip_vs_register_nl_ioctl(void)
{
int ret;
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
goto err_sock;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
goto err_genl;
}
return 0;
err_genl:
nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
return ret;
}
Registering schedulers
register_ip_vs_scheduler links a scheduler into the global list ip_vs_schedulers.
In the 3.18.79 kernel there are already more than a dozen scheduler implementations; each one is a separate module that initializes itself and registers its scheduler when the module is loaded.
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
struct ip_vs_scheduler *sched;
if (!scheduler) {
pr_err("%s(): NULL arg\n", __func__);
return -EINVAL;
}
if (!scheduler->name) {
pr_err("%s(): NULL scheduler_name\n", __func__);
return -EINVAL;
}
/* increase the module use count */
ip_vs_use_count_inc();
mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
return -EINVAL;
}
/*
* Make sure that the scheduler with this name doesn't exist
* in the scheduler list.
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
return -EINVAL;
}
}
/*
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
return 0;
}
Below is the sh (source IP hash) scheduler; the rest of the analysis uses sh as the example.
module_init(ip_vs_sh_init);
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
.name = "sh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
.add_dest = ip_vs_sh_dest_changed,
.del_dest = ip_vs_sh_dest_changed,
.upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
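So loading the module is what makes a scheduler available; for sh, for example:
modprobe ip_vs_sh
lsmod | grep ip_vs_sh
This triggers ip_vs_sh_init, which logs the "[sh] scheduler registered." message seen in register_ip_vs_scheduler above.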
Creating a service and adding real servers
The basic usage of ipvs is: first create a service, specifying the protocol, IP, port and scheduling algorithm; then add real backend ip:port pairs to the service, specifying the forwarding mode.
The service is what is exposed to clients. When a client request arrives (its destination IP is the service IP), the scheduling algorithm picks a suitable real server from the backend pool and the packet is forwarded to it according to the forwarding mode.
ipvsadm -C
// add a virtual service with scheduler sh; -t means TCP
ipvsadm -A -t 1.1.1.10:8080 -s sh
// add a real ip:port to the service; -g selects DR (direct routing) mode
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g
ipvsadm -ln
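For reference, after the two commands above ipvsadm -ln prints something along these lines (exact formatting varies by version):
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  1.1.1.10:8080 sh
  -> 1.1.1.4:8080                 Route   1      0          0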
Let's now look at the code paths for creating a service and for adding an rs.
- Creating a service
The following structure carries the rule configured by the user-space command line down to the kernel:
struct ip_vs_service_user_kern {
/* virtual service addresses */
u16 af;
u16 protocol;
union nf_inet_addr addr; /* virtual ip address */
__be16 port;
u32 fwmark; /* firwall mark of service */
/* virtual service options */
char *sched_name;
char *pe_name;
unsigned int flags; /* virtual service flags */
unsigned int timeout; /* persistent timeout in sec */
__be32 netmask; /* persistent netmask or plen */
};
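The flags/timeout/netmask fields above are how session persistence reaches the kernel; on the ipvsadm side they correspond to the -p option, for example (timeout value is only an illustration):
// keep clients from the same source on the same rs for 300 seconds
ipvsadm -A -t 1.1.1.10:8080 -s sh -p 300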
static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
/* look up the scheduler by the given name; if the first lookup fails, try to auto-load the scheduler module and look again; if both lookups fail, return an error */
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
sched = ip_vs_scheduler_get(u->sched_name);
if (!sched) {
pr_info("Scheduler module ip_vs_%s not found\n",
u->sched_name);
ret = -ENOENT;
goto out_err;
}
}
/* look up the persistence engine (pe) by name */
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out_err;
}
}
/* allocate the service structure */
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
/* allocate the per-cpu stats */
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!svc->stats.cpustats) {
ret = -ENOMEM;
goto out_err;
}
/* initialize the per-cpu stats */
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *ip_vs_stats;
ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
u64_stats_init(&ip_vs_stats->syncp);
}
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
/* copy the values configured on the command line into the service */
svc->af = u->af;
svc->protocol = u->protocol;
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->net = net;
/* initialize the destinations list, which will hold the real servers */
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
/* bind svc to the chosen scheduler: ip_vs_bind_scheduler calls the scheduler's
 * init_service and stores sched in svc->scheduler; init_service allocates the
 * scheduler's private data and hangs it off svc->sched_data */
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
}
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
/* put svc->stats on the list ipvs->est_list */
ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
/* bump the service counter */
ipvs->num_services++;
/* hash the svc into a global table: by <netns,protocol,addr,port> into
 * ip_vs_svc_table, or by fwmark into ip_vs_svc_fwm_table */
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
/* Now there is a service - full throttle */
/* as soon as there is at least one service, ipvs is enabled */
ipvs->enable = 1;
return 0;
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
- Adding a real server
Likewise, the structure below carries the parameters.
conn_flags selects the forwarding mode (DR, tunnel, NAT, ...); if none is given on the command line, DR is the default.
weight is the weight of this real server; the larger it is, the more likely the rs is to be picked. The default is 1.
port is the port given on the command line, but in DR and tunnel mode it is automatically rewritten to the service port, because those modes never rewrite the packet's ports.
u_threshold: once the number of connections to this dest exceeds it, the dest is flagged IP_VS_DEST_F_OVERLOAD, meaning it is overloaded. The default is 0 (no limit).
l_threshold: once the number of connections drops below it, IP_VS_DEST_F_OVERLOAD is cleared. The default is 0.
struct ip_vs_dest_user_kern {
/* destination server address */
union nf_inet_addr addr;
__be16 port;
/* real server options */
unsigned int conn_flags; /* connection flags */
int weight; /* destination weight */
/* thresholds for active connections */
u32 u_threshold; /* upper threshold */
u32 l_threshold; /* lower threshold */
/* Address family of addr */
u16 af;
};
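These fields map to ipvsadm options when adding an rs, for example (option names as in ipvsadm's --weight/--u-threshold/--l-threshold; values are only an illustration):
// weight 3; flag the rs as overloaded above 1000 connections, clear the flag below 800
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g -w 3 -x 1000 -y 800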
After ipvsadm issues the add-rs command, the kernel runs ip_vs_add_dest:
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
/* look up svc->destinations by af, ip and port to see whether this dest already exists */
dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
/* if a dest is deleted while connections still reference it, it is parked on the
 * ipvs->dest_trash list; if an identical dest is found there it can simply be
 * taken back and reused */
dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
if (dest != NULL) {
IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
"dest->refcnt=%d, service %u/%s:%u\n",
IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
/* otherwise allocate a new dest structure; ip_vs_new_dest also ends up calling __ip_vs_update_dest */
ret = ip_vs_new_dest(svc, udest, &dest);
}
LeaveFunction(2);
return ret;
}
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
/* We cannot modify an address and change the address family */
BUG_ON(!add && udest->af != dest->af);
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
ip_vs_rs_hash(ipvs, dest);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
__ip_vs_svc_put(old_svc, true);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
dest->af = udest->af;
spin_lock_bh(&dest->dst_lock);
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
if (add) {
ip_vs_start_estimator(svc->net, &dest->stats);
/* add the dest to svc's circular doubly-linked destinations list */
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
/* call the scheduler's add_dest to redistribute the dests; for the sh scheduler add_dest is ip_vs_sh_dest_changed */
if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
ip_vs_sh_state is the sh scheduler's private data. Each bucket stores one dest, and IP_VS_SH_TAB_SIZE is 256, so sh works with exactly 256 bucket slots.
ip_vs_sh_reassign copies the dests on svc->destinations into buckets->dest. Because there are only 256 buckets, adding more dests than that has no effect: with all weights equal to 1, only the last-added 256 dests take effect (list_add puts new dests at the head of the list, and the reassignment walks the list from the head).
A dest with a larger weight occupies weight consecutive buckets. For example, with two dests d1 (weight 1) and d2 (weight 3), the buckets are filled d1, d2, d2, d2, d1, d2, d2, d2, and so on.
struct ip_vs_sh_state {
struct rcu_head rcu_head;
struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
};
static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
struct ip_vs_dest *dest)
{
struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
ip_vs_sh_reassign(s, svc);
return 0;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
bool empty;
b = &s->buckets[0];
p = &svc->destinations;
empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
dest = rcu_dereference_protected(b->dest, 1);
if (dest)
ip_vs_dest_put(dest);
if (empty)
RCU_INIT_POINTER(b->dest, NULL);
else {
/* svc->destinations is a circular doubly-linked list; skip its head node */
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
ip_vs_dest_hold(dest);
RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
atomic_read(&dest->weight));
/* Don't move to next dest until filling weight */
if (++d_count >= atomic_read(&dest->weight)) {
p = p->next;
d_count = 0;
}
}
b++;
}
return 0;
}
If the forwarding mode is DR or tunnel, the port of the dest being added is automatically rewritten to the service port; see the snippet from the ipvsadm command-line source below. This is also why the second command in the session below fails with "Destination already exists": 1.1.1.3:12 and 1.1.1.3:13 both collapse to 1.1.1.3:8080.
static int process_options(int argc, char **argv, int reading_stdin)
/*
* The destination port must be equal to the service port
* if the IP_VS_CONN_F_TUNNEL or IP_VS_CONN_F_DROUTE is set.
* Don't worry about this if fwmark is used.
*/
if (!ce.svc.fwmark &&
(fwd_method == IP_VS_CONN_F_TUNNEL ||
fwd_method == IP_VS_CONN_F_DROUTE))
ce.dest.port = ce.svc.port;
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:13 -g
Destination already exists
Packet forwarding in DR mode
Assume the following svc and rs have been added:
// -s sh selects the sh (source hash) scheduler
ipvsadm -A -t 1.1.1.10:8080 -s sh
// -g selects DR forwarding
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
The client-to-server flow is 2.2.2.1:4444 -> 1.1.1.10:8080.
At the LOCAL_IN hook the packet is seen by ip_vs_reply4 and ip_vs_remote_request4. ip_vs_reply4 simply calls ip_vs_out(ops->hooknum, skb, AF_INET), which handles traffic leaving this host, so we can set it aside for now.
The interesting path is ip_vs_remote_request4:
static unsigned int
ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_in(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
struct netns_ipvs *ipvs;
/* if the packet has already been handled by ipvs, just accept it */
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/*
* Big tappo:
* - remote client: only PACKET_HOST
* - route: used for struct net when skb->dev is unset
*/
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
ip_vs_fill_iph_skb(af, skb, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
net = skb_net(skb);
ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
/* extract the IP header info */
ip_vs_fill_iph_skb(af, skb, &iph);
...
/* get the protocol data for this protocol, and from it pp, the set of per-protocol operations */
/* Protocol supported? */
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
/* first look up ip_vs_conn_tab by caddr and cport. If a connection already exists,
 * take the fast path and forward the packet via packet_xmit. Otherwise take the
 * slow path: find the svc, let the scheduler pick a suitable rs, create a new
 * connection and bind its transmit function to packet_xmit, which is then called
 * to forward the packet. */
cp = pp->conn_in_get(af, skb, &iph, 0);
...
if (unlikely(!cp) && !iph.fragoffs) {
/* No (second) fragments need to enter here, as nf_defrag_ipv6
* replayed fragment zero will already have created the cp
*/
int v;
/* 1. call the protocol's conn_schedule (tcp_conn_schedule for TCP): look up a svc
 * matching the destination ip and port; if one is found, call sched->schedule
 * (ip_vs_sh_schedule for the sh scheduler) to pick a suitable rs and create the
 * connection via ip_vs_conn_new */
/* Schedule and create new connection entry into &cp */
if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
return v;
}
...
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* 2. call the transmit function bound to the connection */
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
}
1. pp->conn_schedule, which for TCP is tcp_conn_schedule
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct netns_ipvs *ipvs;
/* get the layer-4 header */
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
net = skb_net(skb);
ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
/* look for a svc matching the packet's destination ip and port */
if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
(svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
&iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
else
*verdict = NF_DROP;
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, int *ignored,
struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
*ignored = 1;
/*
* IPv6 frags, only the first hit here.
*/
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;
...
...
sched = rcu_dereference(svc->scheduler);
if (sched) {
/* read svc->sched_data after svc->scheduler */
smp_rmb();
/* call the scheduler's schedule to pick a suitable dest; for the sh scheduler this is ip_vs_sh_schedule */
dest = sched->schedule(svc, skb, iph);
} else {
dest = NULL;
}
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
&& iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
* Create a connection entry.
*/
{
struct ip_vs_conn_param p;
ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
&iph->saddr, pptr[0], &iph->daddr,
pptr[1], &p);
/* create the new connection */
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
dest->port ? dest->port : pptr[1],
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
return NULL;
}
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
cp->flags, atomic_read(&cp->refcnt));
ip_vs_conn_stats(cp, svc);
return cp;
}
1.1 The scheduling function: picking a suitable rs
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
__be16 port = 0;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
port = ip_vs_sh_get_port(skb, iph);
s = (struct ip_vs_sh_state *) svc->sched_data;
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
else
dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
return dest;
}
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
return (!dest || is_unavailable(dest)) ? NULL : dest;
}
1.2 Creating a new connection
ip_vs_conn_new creates a new connection entry, computes a hash from caddr and cport and inserts the entry into ip_vs_conn_tab.
The entry holds the client's caddr and cport, the service's vaddr and vport, and the real server's daddr and dport.
Depending on the forwarding mode, a different transmit function is bound to the connection.
/*
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
struct netns_ipvs *ipvs = net_ipvs(p->net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
/* allocate the connection entry from the slab cache (error logging elided) */
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL)
return NULL;
INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->vaddr, p->vaddr);
cp->vport = p->vport;
ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
ip_vs_pe_get(p->pe);
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
} else {
cp->pe = NULL;
cp->pe_data = NULL;
cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
/*
* Set the entry is referenced by the current thread before hashing
* it in the table, so that other thread run ip_vs_random_dropentry
* but cannot drop this entry.
*/
atomic_set(&cp->refcnt, 1);
cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
cp->packet_xmit = NULL;
cp->app = NULL;
cp->app_data = NULL;
/* reset struct ip_vs_seq */
cp->in_seq.delta = 0;
cp->out_seq.delta = 0;
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
/* bind a transmit function according to the forwarding mode */
ip_vs_bind_xmit(cp);
if (unlikely(pd && atomic_read(&pd->appcnt)))
ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
* is created and destroyed for every packet.
* Sometimes keeping conntrack can be useful for
* IP_VS_CONN_F_ONE_PACKET too.
*/
if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
When the connection is created, the transmit function matching the forwarding mode is bound to it:
/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;
case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
if (cp->daf == AF_INET6)
cp->packet_xmit = ip_vs_tunnel_xmit_v6;
else
#endif
cp->packet_xmit = ip_vs_tunnel_xmit;
break;
case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;
case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;
case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
- The DR-mode transmit function ip_vs_dr_xmit
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
int local;
EnterFunction(10);
rcu_read_lock();
/* the route lookup uses cp->daddr.ip (the rs), not the destination ip in the skb (the vip) */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
if (local < 0)
goto tx_error;
if (local) {
rcu_read_unlock();
/* if the route says local, returning accept is enough */
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
ip_send_check(ip_hdr(skb));
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
/* not local: send the packet out through the NF_INET_LOCAL_OUT hook */
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
/* set ipvs_property to 1 to mark the skb as already handled by ipvs */
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
At the NF_INET_LOCAL_OUT hook two more ipvs hook functions run, but because skb->ipvs_property is already set they do nothing and return NF_ACCEPT:
ip_vs_local_reply4 -> ip_vs_out:
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
ip_vs_local_request4 -> ip_vs_in:
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
After these hooks return, dst_output is called, which goes through ip_output -> ip_finish_output -> ip_finish_output2:
int ip_output(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/* pass the NF_INET_POST_ROUTING hook (nothing special happens to the packet there), then call ip_finish_output */
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb_is_gso(skb))
return ip_finish_output_gso(skb);
if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(skb, ip_finish_output2);
return ip_finish_output2(skb);
}
ip_finish_output -> ip_finish_output2 then asks the neighbour subsystem to fill in the MAC addresses, and this is the crucial step for DR mode: the source MAC becomes the LB's MAC and the destination MAC becomes the RS's MAC. That works because the earlier route lookup used the RS address, not the VIP carried in the packet.
The next hop is computed by rt_nexthop: for an off-link destination it is rt->rt_gateway (the gateway address), while for an on-link destination it is the destination address itself. Since the LB and the RS are on the same subnet, the next hop resolved here is the RS address.
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
if (rt->rt_gateway)
return rt->rt_gateway;
return daddr;
}
// in ip_finish_output2: get the neigh entry for the next hop; if none exists, create one.
// the packet can only be sent once the MAC address of the RS is known.
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
// __neigh_create calls tbl->constructor(n), i.e. arp_constructor, which sets:
neigh->ops = &arp_hh_ops;
neigh->output = neigh->ops->output; // neigh_resolve_output
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb);
// inside dst_neigh_output:
// if the ARP entry is in a valid state, the packet goes straight out via dev_queue_xmit
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
else
// otherwise neigh_resolve_output queues the skb on arp_queue and sends an ARP request;
// the packet is transmitted once the ARP reply arrives
return n->output(n, skb); // neigh_resolve_output
- DR mode summary
a. In DR mode the LB does not touch the IP layer when forwarding to an RS; it only rewrites the destination MAC to the RS's MAC, so the LB and the RSes must be on the same L2 network.
b. Both the LB and the RSes must accept packets whose destination IP is the VIP, so the VIP has to be configured on both.
On the LB, keepalived is typically used to run an active/standby pair, with the VIP floating between the two LBs.
On each RS the VIP is configured on the lo interface with a /32 mask, and all NICs on the RS are configured with
arp_ignore=1 - only answer ARP requests whose target address is configured on the receiving interface
arp_announce=2 - ignore the packet's source IP and choose a suitable address on the outgoing interface as the ARP source
c. The client's request passes through the LB to the RS, but the reply does not come back through the LB, so the RS must be able to reach the client directly and its gateway must not point at the LB.
d. Port mapping is not supported, so there is no need to give a port when adding an rs; even if one is given it is rewritten to the service port.
As the code above shows, a packet from the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks on the LB. Each hook has two ipvs functions registered (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), but the only one that actually processes the request is ip_vs_remote_request4 at LOCAL_IN.
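To make point b concrete, a typical RS-side setup might look like the following sketch (the VIP 1.1.1.10 is the one from the earlier examples; apply this on every real server):
// put the VIP on lo with a /32 mask and suppress ARP replies/announcements for it
ip addr add 1.1.1.10/32 dev lo
sysctl -w net.ipv4.conf.all.arp_ignore=1
sysctl -w net.ipv4.conf.lo.arp_ignore=1
sysctl -w net.ipv4.conf.all.arp_announce=2
sysctl -w net.ipv4.conf.lo.arp_announce=2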
TUNNEL forwarding mode
[http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.LVS-Tun.html]
If you want to try a test LVS-Tun setup on the bench, take a standard LVS-DR setup LVS-DR example, change lo on the realservers to tunl0 (and handle the ARP problem on tunl0) and change the ipvsadm switch from -g to -i . If your clients are going to be sending large packets, you need to set the MTU (see MTU for the ipip packet DIP->RIP). This can be done on the realserver with iptables (see tunl MTU solved) or iproute2 (see setting the MTU by route).
In other words: to try an LVS-Tun setup on the bench, take an existing LVS-DR setup and tweak it slightly: move the VIP from lo to tunl0 on the real servers, and change the ipvsadm forwarding switch from -g to -i when adding the real servers.
In LVS-Tun, the tunl0 device holds the VIP, just as the lo device holds the device for LVS-DR. You need to build the tunl0 device into the Linux kernel (in networking options - IP:tunneling) - it is turned off by default. The tunnelling (ipip) can be built as a module, in which case you'll have to insmod ipip before you can use it, or you can build ipip directly into the kernel. With a kernel enabled for ipip, you should be able to see the unconfigured tunl0 device with ifconfig or with ip addr show (Feb 2004 - my ifconfig used to see the unconfigured tunl0, but it doesn't anymore.)
In other words: if ipip is built into the kernel, the virtual device tunl0 appears automatically; if it is built as a module instead, load it with modprobe ipip and tunl0 will appear as well, as shown below:
[root@test1 ~]# modprobe ipip
[root@test1 ~]# lsmod | grep ipip
ipip 16384 0
tunnel4 16384 1 ipip
ip_tunnel 24576 1 ipip
[root@test1 ~]# ip a
...
4: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
Then you configure the tunl0 device
ifconfig tunl0 192.168.1.110 netmask 255.255.255.255 broadcast 192.168.1.110
or
ip addr add dev tunl0 192.168.1.110/32 brd 192.168.1.110
Note
the VIP is a /32 addr, so the brd addr is the VIP, not x.x.x.255.
On the LB side the tunnel encapsulation does not depend on any other module: ip_vs_tunnel_xmit builds the ipip packet itself and sends it out. This also means only ipip tunnels are supported.
The RS side must load the ipip module in advance so it can decapsulate the ipip packets. After processing the request, the RS sends the reply directly to the client, without encapsulation and without going back through the LB.
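A minimal LVS-Tun setup following the HOWTO quoted above might therefore look like this (same example addresses as before, purely illustrative):
// on the LB: add the rs with -i (ipip tunnel) instead of -g
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:8080 -i
// on the RS: load ipip, put the VIP on tunl0 and bring it up
// (the arp_ignore/arp_announce settings from the DR section still apply)
modprobe ipip
ip addr add 1.1.1.10/32 dev tunl0
ip link set tunl0 up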
The LB-side code path is mostly the same as in LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two ipvs hook functions (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), and again only ip_vs_remote_request4 at LOCAL_IN does real work; the only difference is that the transmit function called at the end is ip_vs_tunnel_xmit instead of ip_vs_dr_xmit.
/*
* IP Tunneling transmitter
*
* This function encapsulates the packet in a new IP packet, its
* destination will be set to cp->daddr. Most code of this function
* is taken from ipip.c.
*
* It is used in VS/TUN cluster. The load balancer selects a real
* server from a cluster based on a scheduling algorithm,
* encapsulates the request packet and forwards it to the selected
* server. For example, all real servers are configured with
* "ifconfig tunl0 <Virtual IP Address> up". When the server receives
* the encapsulated packet, it will decapsulate the packet, processe
* the request and return the response packets directly to the client
* without passing the load balancer. This can greatly increase the
* scalability of virtual server.
*
* Used for ANY protocol
*/
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
__u8 next_protocol = 0;
__u8 dsfield = 0;
__u8 ttl = 0;
__be16 df = 0;
__be16 *dfp = NULL;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
EnterFunction(10);
rcu_read_lock();
/* route lookup by cp->daddr; note that daddr is the rs ip chosen by the scheduler, not the vip in the packet */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_CONNECT |
IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
if (local < 0)
goto tx_error;
/* if the route says local, just return accept */
if (local) {
rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
/* take the outgoing device from the route */
rt = skb_rtable(skb);
tdev = rt->dst.dev;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
/* compute the extra header room needed on the outgoing device for the outer IP header */
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
/* make sure the skb has max_headroom of headroom for the encapsulation, and pull the needed fields out of the inner IP header */
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
goto tx_error;
skb = iptunnel_handle_offloads(
skb, false, __tun_gso_type_mask(AF_INET, cp->af));
if (IS_ERR(skb))
goto tx_error;
skb->transport_header = skb->network_header;
/* make room in front of the inner IP header for one more IP header, the outer one */
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
/* fill in the outer IP header: saddr was provided by the route lookup, daddr is cp->daddr, i.e. the rs ip */
/*
* Push down and install the IPIP header.
*/
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = next_protocol;
iph->tos = dsfield;
iph->daddr = cp->daddr.ip;
iph->saddr = saddr;
iph->ttl = ttl;
ip_select_ident(skb, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
/* hand the packet to the LOCAL_OUT hook; by now it is an ipip packet whose outer
 * header carries the LB and rs tunnel addresses and whose inner header still
 * carries the client ip and the vip */
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
if (!IS_ERR(skb))
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
DNAT forwarding mode
The LB-side code path is again mostly the same as LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks with their two ipvs functions each (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), and only ip_vs_remote_request4 at LOCAL_IN does real work; the difference is that the transmit function called at the end is ip_vs_nat_xmit instead of ip_vs_dr_xmit.
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
EnterFunction(10);
rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
was_input = rt_is_input_route(skb_rtable(skb));
/* route lookup by cp->daddr.ip, not by the destination ip in the skb (the vip) */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
if (local < 0)
goto tx_error;
rt = skb_rtable(skb);
...
/* call the protocol's dnat_handler (tcp_dnat_handler), which rewrites the destination port to cp->dport */
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
/* rewrite the destination ip to cp->daddr.ip */
ip_hdr(skb)->daddr = cp->daddr.ip;
/* recompute the IP checksum */
ip_send_check(ip_hdr(skb));
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
rcu_read_unlock();
LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);
/* Remove the early_demux association unless it's bound for the
* exact same port and address on this host after translation.
*/
if (!local || cp->vport != cp->dport ||
!ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
ip_vs_drop_early_demux_sk(skb);
if (!local) {
skb_forward_csum(skb);
/* as before, send the DNATed packet out through the LOCAL_OUT hook */
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
So the flow arriving at the LB is cip:cport -> vip:vport,
and after DNAT it becomes cip:cport -> rip:rport.
The RS's reply is rip:rport -> cip:cport. rip:rport has to be translated back to vip:vport, so the reply must go through the LB for SNAT. Because its destination ip is not an address of the LB, the RS's default gateway must point at the LB. When the reply reaches the LB, the route lookup finds that the destination is not local, so the packet has to be forwarded (net.ipv4.ip_forward = 1 is required). It goes through ip_forward, which at the end passes the NF_INET_FORWARD hook:
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
Two ipvs functions are registered at this hook: ip_vs_forward_icmp and ip_vs_reply4. The former obviously handles ICMP; the interesting one is ip_vs_reply4.
static unsigned int
ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_out(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
EnterFunction(11);
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/* Bad... Do not break raw sockets */
if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
if (inet && sk->sk_family == PF_INET && inet->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
net = skb_net(skb);
if (!net_ipvs(net)->enable)
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_out_icmp(skb, &related, hooknum);
if (related)
return verdict;
}
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
/* the connection was created keyed on cip and cport in the client-to-rs direction,
 * so for the reverse direction we look it up by destination ip and port, which are
 * again cip and cport */
cp = pp->conn_out_get(af, skb, &iph, 0);
/* only packets that match an existing connection need handling here */
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph);
...
}
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, iph->len))
goto drop;
/* mangle the packet */
/* call the protocol's snat_handler (tcp_snat_handler), which rewrites the source port back to vport */
if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
goto drop;
{
/* rewrite the source ip to vaddr */
ip_hdr(skb)->saddr = cp->vaddr.ip;
ip_send_check(ip_hdr(skb));
}
/*
* nf_iterate does not expect change in the skb->dst->dev.
* It looks like it is not fatal to enable this code for hooks
* where our handlers are at the end of the chain list and
* when all next handlers use skb->dst->dev and not outdev.
* It will definitely route properly the inout NAT traffic
* when multiple paths are used.
*/
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
* if it came from this machine itself. So re-compute
* the routing information.
*/
if (ip_vs_route_me_harder(af, skb))
goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
LeaveFunction(11);
/* just return NF_ACCEPT; after the hook returns, ip_forward_finish eventually sends the packet on to the client */
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
LeaveFunction(11);
return NF_STOLEN;
}
DNAT forwarding summary:
The rs's default gateway must point at the DIP (the LB).
Both requests and replies pass through the LB, so under heavy traffic the LB can become the bottleneck.
Port mapping is supported.
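A minimal NAT-mode sketch for comparison (10.0.0.0/24 stands in for an internal RS network and is purely hypothetical; 10.0.0.1 is assumed to be the LB's DIP):
// on the LB: -m selects NAT; port mapping is allowed here
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 10.0.0.3:80 -m
sysctl -w net.ipv4.ip_forward=1
// on each RS: the default route must go back through the LB's DIP
ip route add default via 10.0.0.1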













