LVS is a layer-4 load balancer built on the netfilter framework. It consists of two parts: the user-space ipvsadm configuration tool and the in-kernel core module (ko).
The commonly used forwarding modes are DR, tunnel and DNAT. In DR and tunnel mode only the request traffic passes through the LVS director; the real server (rs) sends its reply directly back to the client. DNAT mode allows port mapping, so the reply must also go back through the director to be SNATed before it can reach the client.
LVS picks a real server with a scheduler; search the source for register_ip_vs_scheduler to see which schedulers are available.
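For example, in a kernel source tree the available schedulers can be listed by grepping for that registration call (path assumed to be the usual net/netfilter/ipvs directory):
grep -l register_ip_vs_scheduler net/netfilter/ipvs/ip_vs_*.c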
Module initialization
module_init(ip_vs_init);
static int __init ip_vs_init(void)
{
int ret;
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
goto exit;
}
ip_vs_protocol_init();
ret = ip_vs_conn_init();
if (ret < 0) {
pr_err("can't setup connection table.\n");
goto cleanup_protocol;
}
ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
if (ret < 0)
goto cleanup_conn;
ret = register_pernet_device(&ipvs_core_dev_ops);
if (ret < 0)
goto cleanup_sub;
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
goto cleanup_dev;
}
ret = ip_vs_register_nl_ioctl();
if (ret < 0) {
pr_err("can't register netlink/ioctl.\n");
goto cleanup_hooks;
}
pr_info("ipvs loaded.\n");
return ret;
cleanup_hooks:
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
cleanup_dev:
unregister_pernet_device(&ipvs_core_dev_ops);
cleanup_sub:
unregister_pernet_subsys(&ipvs_core_ops);
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_protocol:
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
exit:
return ret;
}
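In practice the core module is either auto-loaded by ipvsadm or loaded by hand; the pr_info above should then show up in the kernel log (the exact message prefix depends on the kernel's pr_fmt):
modprobe ip_vs
dmesg | grep -i ipvs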
1. ip_vs_control_init
/*
* Hash table: for virtual service lookups
*/
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
int __init ip_vs_control_init(void)
{
int idx;
int ret;
EnterFunction(2);
/* initialize the two hash tables; each has 256 buckets */
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now ? */
/* register a netdevice notifier: when a NIC goes down, ip_vs_dst_event is called to clean up the entries related to that device */
ret = register_netdevice_notifier(&ip_vs_dst_notifier);
if (ret < 0)
return ret;
LeaveFunction(2);
return 0;
}
2. ip_vs_protocol_init
This registers each protocol's ip_vs_protocol descriptor into ip_vs_proto_table.
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
.init = NULL,
.exit = NULL,
.init_netns = __ip_vs_tcp_init,
.exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
};
#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
if (pp->init != NULL)
pp->init(pp);
return 0;
}
int __init ip_vs_protocol_init(void)
{
char protocols[64];
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
strcat(protocols, ", "); \
strcat(protocols, (p)->name); \
} while (0)
protocols[0] = '\0';
protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_SCTP
REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
return 0;
}
3. ip_vs_conn_init
#define CONFIG_IP_VS_TAB_BITS 12
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
int __init ip_vs_conn_init(void)
{
int idx;
/* connection table size: 1 << 12 = 4096 buckets */
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
/*
* Allocate the connection hash table and initialize its list heads
*/
/* allocate the 4096 hash bucket heads of ip_vs_conn_tab */
ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
return -ENOMEM;
}
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
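The bucket count is fixed once the module is loaded. On kernels where ip_vs exposes conn_tab_bits as a module parameter, it can be raised for hosts with many concurrent connections, e.g. (value only an illustration):
modprobe ip_vs conn_tab_bits=20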
4. Register the pernet operations ipvs_core_ops and ipvs_core_dev_ops
For every net namespace the .init callback is called to do the per-namespace initialization.
ipvs_core_dev_ops only provides an .exit callback, which runs when the module is unloaded.
static struct pernet_operations ipvs_core_ops = {
.init = __ip_vs_init,
.exit = __ip_vs_cleanup,
.id = &ip_vs_net_id,
.size = sizeof(struct netns_ipvs),
};
static struct pernet_operations ipvs_core_dev_ops = {
.exit = __ip_vs_dev_cleanup,
};
__ip_vs_init mainly initializes the fields of the netns_ipvs structure:
static int __net_init __ip_vs_init(struct net *net)
{
struct netns_ipvs *ipvs;
ipvs = net_generic(net, ip_vs_net_id);
if (ipvs == NULL)
return -ENOMEM;
/* Hold the beast until a service is registerd */
ipvs->enable = 0;
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
/* initialize the estimator-related fields */
if (ip_vs_estimator_net_init(net) < 0)
goto estimator_fail;
if (ip_vs_control_net_init(net) < 0)
goto control_fail;
if (ip_vs_protocol_net_init(net) < 0)
goto protocol_fail;
if (ip_vs_app_net_init(net) < 0)
goto app_fail;
if (ip_vs_conn_net_init(net) < 0)
goto conn_fail;
if (ip_vs_sync_net_init(net) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
sizeof(struct netns_ipvs), ipvs->gen);
return 0;
/*
* Error handling
*/
sync_fail:
ip_vs_conn_net_cleanup(net);
conn_fail:
ip_vs_app_net_cleanup(net);
app_fail:
ip_vs_protocol_net_cleanup(net);
protocol_fail:
ip_vs_control_net_cleanup(net);
control_fail:
ip_vs_estimator_net_cleanup(net);
estimator_fail:
net->ipvs = NULL;
return -ENOMEM;
}
5. Register the hook functions with the netfilter framework
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
};
nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
6. ip_vs_register_nl_ioctl
This registers two channels for user/kernel communication: sockopt and generic netlink; ipvsadm can use either of them to push commands down.
int __init ip_vs_register_nl_ioctl(void)
{
int ret;
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
goto err_sock;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
goto err_genl;
}
return 0;
err_genl:
nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
return ret;
}
Registering schedulers
register_ip_vs_scheduler links a scheduler into the global list ip_vs_schedulers.
In the 3.18.79 kernel there are already more than a dozen scheduler implementations; each one is a separate module that initializes itself and registers its scheduler when the module is loaded.
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
struct ip_vs_scheduler *sched;
if (!scheduler) {
pr_err("%s(): NULL arg\n", __func__);
return -EINVAL;
}
if (!scheduler->name) {
pr_err("%s(): NULL scheduler_name\n", __func__);
return -EINVAL;
}
/* increase the module use count */
ip_vs_use_count_inc();
mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
return -EINVAL;
}
/*
* Make sure that the scheduler with this name doesn't exist
* in the scheduler list.
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
return -EINVAL;
}
}
/*
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
return 0;
}
Below is the sh (source IP hash) scheduler; the rest of the analysis uses sh as the example.
module_init(ip_vs_sh_init);
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
.name = "sh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
.add_dest = ip_vs_sh_dest_changed,
.del_dest = ip_vs_sh_dest_changed,
.upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
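So loading the module is what makes a scheduler available; for sh, for example:
modprobe ip_vs_sh
lsmod | grep ip_vs_sh
This triggers ip_vs_sh_init, which logs the "[sh] scheduler registered." message seen in register_ip_vs_scheduler above.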
Creating a service and adding real servers
The basic usage of ipvs is: first create a service, specifying the protocol, IP, port and scheduling algorithm; then add real backend ip:port pairs to the service, specifying the forwarding mode.
The service is what is exposed to clients. When a client request arrives (its destination IP is the service IP), the scheduling algorithm picks a suitable real server from the backend pool and the packet is forwarded to it according to the forwarding mode.
ipvsadm -C
// add a virtual service with scheduler sh; -t means TCP
ipvsadm -A -t 1.1.1.10:8080 -s sh
// add a real ip:port to the service; -g selects DR (direct routing) mode
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g
ipvsadm -ln
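For reference, after the two commands above ipvsadm -ln prints something along these lines (exact formatting varies by version):
IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  1.1.1.10:8080 sh
  -> 1.1.1.4:8080                 Route   1      0          0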
Let's now look at the code paths for creating a service and for adding an rs.
- Creating a service
The following structure carries the rule configured by the user-space command line down to the kernel:
struct ip_vs_service_user_kern {
/* virtual service addresses */
u16 af;
u16 protocol;
union nf_inet_addr addr; /* virtual ip address */
__be16 port;
u32 fwmark; /* firwall mark of service */
/* virtual service options */
char *sched_name;
char *pe_name;
unsigned int flags; /* virtual service flags */
unsigned int timeout; /* persistent timeout in sec */
__be32 netmask; /* persistent netmask or plen */
};
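The flags/timeout/netmask fields above are how session persistence reaches the kernel; on the ipvsadm side they correspond to the -p option, for example (timeout value is only an illustration):
// keep clients from the same source on the same rs for 300 seconds
ipvsadm -A -t 1.1.1.10:8080 -s sh -p 300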
static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
/* look up the scheduler by the given name; if the first lookup fails, try to auto-load the scheduler module and look again; if both lookups fail, return an error */
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
sched = ip_vs_scheduler_get(u->sched_name);
if (!sched) {
pr_info("Scheduler module ip_vs_%s not found\n",
u->sched_name);
ret = -ENOENT;
goto out_err;
}
}
/* look up the persistence engine (pe) by name */
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out_err;
}
}
/* allocate the service structure */
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
/* allocate the per-cpu stats */
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!svc->stats.cpustats) {
ret = -ENOMEM;
goto out_err;
}
/* initialize the per-cpu stats */
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *ip_vs_stats;
ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
u64_stats_init(&ip_vs_stats->syncp);
}
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
/* copy the values configured on the command line into the service */
svc->af = u->af;
svc->protocol = u->protocol;
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->net = net;
/* initialize the destinations list, which will hold the real servers */
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
/* bind svc to the chosen scheduler: ip_vs_bind_scheduler calls the scheduler's
 * init_service and stores sched in svc->scheduler; init_service allocates the
 * scheduler's private data and hangs it off svc->sched_data */
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
}
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
/* put svc->stats on the list ipvs->est_list */
ip_vs_start_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
/* bump the service counter */
ipvs->num_services++;
/* hash the svc into a global table: by <netns,protocol,addr,port> into
 * ip_vs_svc_table, or by fwmark into ip_vs_svc_fwm_table */
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
/* Now there is a service - full throttle */
/* as soon as there is at least one service, ipvs is enabled */
ipvs->enable = 1;
return 0;
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
- Adding a real server
Likewise, the structure below carries the parameters.
conn_flags selects the forwarding mode (DR, tunnel, NAT, ...); if none is given on the command line, DR is the default.
weight is the weight of this real server; the larger it is, the more likely the rs is to be picked. The default is 1.
port is the port given on the command line, but in DR and tunnel mode it is automatically rewritten to the service port, because those modes never rewrite the packet's ports.
u_threshold: once the number of connections to this dest exceeds it, the dest is flagged IP_VS_DEST_F_OVERLOAD, meaning it is overloaded. The default is 0 (no limit).
l_threshold: once the number of connections drops below it, IP_VS_DEST_F_OVERLOAD is cleared. The default is 0.
struct ip_vs_dest_user_kern {
/* destination server address */
union nf_inet_addr addr;
__be16 port;
/* real server options */
unsigned int conn_flags; /* connection flags */
int weight; /* destination weight */
/* thresholds for active connections */
u32 u_threshold; /* upper threshold */
u32 l_threshold; /* lower threshold */
/* Address family of addr */
u16 af;
};
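These fields map to ipvsadm options when adding an rs, for example (option names as in ipvsadm's --weight/--u-threshold/--l-threshold; values are only an illustration):
// weight 3; flag the rs as overloaded above 1000 connections, clear the flag below 800
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.4:8080 -g -w 3 -x 1000 -y 800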
After ipvsadm issues the add-rs command, the kernel runs ip_vs_add_dest:
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
/* look up svc->destinations by af, ip and port to see whether this dest already exists */
dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
/* if a dest is deleted while connections still reference it, it is parked on the
 * ipvs->dest_trash list; if an identical dest is found there it can simply be
 * taken back and reused */
dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
if (dest != NULL) {
IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
"dest->refcnt=%d, service %u/%s:%u\n",
IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
/* otherwise allocate a new dest structure; ip_vs_new_dest also ends up calling __ip_vs_update_dest */
ret = ip_vs_new_dest(svc, udest, &dest);
}
LeaveFunction(2);
return ret;
}
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
/* We cannot modify an address and change the address family */
BUG_ON(!add && udest->af != dest->af);
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
ip_vs_rs_hash(ipvs, dest);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
__ip_vs_svc_put(old_svc, true);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
dest->af = udest->af;
spin_lock_bh(&dest->dst_lock);
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
if (add) {
ip_vs_start_estimator(svc->net, &dest->stats);
/* add the dest to svc's circular doubly-linked destinations list */
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
/* call the scheduler's add_dest to redistribute the dests; for the sh scheduler add_dest is ip_vs_sh_dest_changed */
if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
ip_vs_sh_state is the sh scheduler's private data. Each bucket stores one dest, and IP_VS_SH_TAB_SIZE is 256, so sh works with exactly 256 bucket slots.
ip_vs_sh_reassign copies the dests on svc->destinations into buckets->dest. Because there are only 256 buckets, adding more dests than that has no effect: with all weights equal to 1, only the last-added 256 dests take effect (list_add puts new dests at the head of the list, and the reassignment walks the list from the head).
A dest with a larger weight occupies weight consecutive buckets. For example, with two dests d1 (weight 1) and d2 (weight 3), the buckets are filled d1, d2, d2, d2, d1, d2, d2, d2, and so on.
struct ip_vs_sh_state {
struct rcu_head rcu_head;
struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
};
static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
struct ip_vs_dest *dest)
{
struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
ip_vs_sh_reassign(s, svc);
return 0;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
bool empty;
b = &s->buckets[0];
p = &svc->destinations;
empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
dest = rcu_dereference_protected(b->dest, 1);
if (dest)
ip_vs_dest_put(dest);
if (empty)
RCU_INIT_POINTER(b->dest, NULL);
else {
/* svc->destinations is a circular doubly-linked list; skip its head node */
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
ip_vs_dest_hold(dest);
RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
atomic_read(&dest->weight));
/* Don't move to next dest until filling weight */
if (++d_count >= atomic_read(&dest->weight)) {
p = p->next;
d_count = 0;
}
}
b++;
}
return 0;
}
If the forwarding mode is DR or tunnel, the port of the dest being added is automatically rewritten to the service port; see the snippet from the ipvsadm command-line source below. This is also why the second command in the session below fails with "Destination already exists": 1.1.1.3:12 and 1.1.1.3:13 both collapse to 1.1.1.3:8080.
static int process_options(int argc, char **argv, int reading_stdin)
/*
* The destination port must be equal to the service port
* if the IP_VS_CONN_F_TUNNEL or IP_VS_CONN_F_DROUTE is set.
* Don't worry about this if fwmark is used.
*/
if (!ce.svc.fwmark &&
(fwd_method == IP_VS_CONN_F_TUNNEL ||
fwd_method == IP_VS_CONN_F_DROUTE))
ce.dest.port = ce.svc.port;
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
[root@test1 ipvsadm-1.31]# ./ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:13 -g
Destination already exists
Packet forwarding in DR mode
Assume the following svc and rs have been added:
// -s sh selects the sh (source hash) scheduler
ipvsadm -A -t 1.1.1.10:8080 -s sh
// -g selects DR forwarding
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:12 -g
The client-to-server flow is 2.2.2.1:4444 -> 1.1.1.10:8080.
At the LOCAL_IN hook the packet is seen by ip_vs_reply4 and ip_vs_remote_request4. ip_vs_reply4 simply calls ip_vs_out(ops->hooknum, skb, AF_INET), which handles traffic leaving this host, so we can set it aside for now.
The interesting path is ip_vs_remote_request4:
static unsigned int
ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_in(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
struct netns_ipvs *ipvs;
/* if the packet has already been handled by ipvs, just accept it */
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/*
* Big tappo:
* - remote client: only PACKET_HOST
* - route: used for struct net when skb->dev is unset
*/
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
ip_vs_fill_iph_skb(af, skb, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
net = skb_net(skb);
ipvs = net_ipvs(net);
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
/* extract the IP header info */
ip_vs_fill_iph_skb(af, skb, &iph);
...
/* get the protocol data for this protocol, and from it pp, the set of per-protocol operations */
/* Protocol supported? */
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
/* first look up ip_vs_conn_tab by caddr and cport. If a connection already exists,
 * take the fast path and forward the packet via packet_xmit. Otherwise take the
 * slow path: find the svc, let the scheduler pick a suitable rs, create a new
 * connection and bind its transmit function to packet_xmit, which is then called
 * to forward the packet. */
cp = pp->conn_in_get(af, skb, &iph, 0);
...
if (unlikely(!cp) && !iph.fragoffs) {
/* No (second) fragments need to enter here, as nf_defrag_ipv6
* replayed fragment zero will already have created the cp
*/
int v;
/* 1. call the protocol's conn_schedule (tcp_conn_schedule for TCP): look up a svc
 * matching the destination ip and port; if one is found, call sched->schedule
 * (ip_vs_sh_schedule for the sh scheduler) to pick a suitable rs and create the
 * connection via ip_vs_conn_new */
/* Schedule and create new connection entry into &cp */
if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
return v;
}
...
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* 2. call the transmit function bound to the connection */
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
}
1. pp->conn_schedule, which for TCP is tcp_conn_schedule
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct netns_ipvs *ipvs;
/* get the layer-4 header */
th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
net = skb_net(skb);
ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
rcu_read_lock();
/* look for a svc matching the packet's destination ip and port */
if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
(svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
&iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
else
*verdict = NF_DROP;
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_proto_data *pd, int *ignored,
struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
*ignored = 1;
/*
* IPv6 frags, only the first hit here.
*/
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;
...
...
sched = rcu_dereference(svc->scheduler);
if (sched) {
/* read svc->sched_data after svc->scheduler */
smp_rmb();
/* call the scheduler's schedule to pick a suitable dest; for the sh scheduler this is ip_vs_sh_schedule */
dest = sched->schedule(svc, skb, iph);
} else {
dest = NULL;
}
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
&& iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
* Create a connection entry.
*/
{
struct ip_vs_conn_param p;
ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
&iph->saddr, pptr[0], &iph->daddr,
pptr[1], &p);
/* create the new connection */
cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
dest->port ? dest->port : pptr[1],
flags, dest, skb->mark);
if (!cp) {
*ignored = -1;
return NULL;
}
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
cp->flags, atomic_read(&cp->refcnt));
ip_vs_conn_stats(cp, svc);
return cp;
}
1.1 The scheduling function: picking a suitable rs
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_state *s;
__be16 port = 0;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
port = ip_vs_sh_get_port(skb, iph);
s = (struct ip_vs_sh_state *) svc->sched_data;
if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
else
dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
return dest;
}
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
return (!dest || is_unavailable(dest)) ? NULL : dest;
}
1.2 Creating a new connection
ip_vs_conn_new creates a new connection entry, computes a hash from caddr and cport and inserts the entry into ip_vs_conn_tab.
The entry holds the client's caddr and cport, the service's vaddr and vport, and the real server's daddr and dport.
Depending on the forwarding mode, a different transmit function is bound to the connection.
/*
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
struct netns_ipvs *ipvs = net_ipvs(p->net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
/* allocate the connection entry from the slab cache (error logging elided) */
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL)
return NULL;
INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->daf = dest_af;
cp->protocol = p->protocol;
ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->vaddr, p->vaddr);
cp->vport = p->vport;
ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
ip_vs_pe_get(p->pe);
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
} else {
cp->pe = NULL;
cp->pe_data = NULL;
cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
/*
* Set the entry is referenced by the current thread before hashing
* it in the table, so that other thread run ip_vs_random_dropentry
* but cannot drop this entry.
*/
atomic_set(&cp->refcnt, 1);
cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
cp->packet_xmit = NULL;
cp->app = NULL;
cp->app_data = NULL;
/* reset struct ip_vs_seq */
cp->in_seq.delta = 0;
cp->out_seq.delta = 0;
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
/* bind a transmit function according to the forwarding mode */
ip_vs_bind_xmit(cp);
if (unlikely(pd && atomic_read(&pd->appcnt)))
ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
* is created and destroyed for every packet.
* Sometimes keeping conntrack can be useful for
* IP_VS_CONN_F_ONE_PACKET too.
*/
if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
When the connection is created, the transmit function matching the forwarding mode is bound to it:
/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;
case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
if (cp->daf == AF_INET6)
cp->packet_xmit = ip_vs_tunnel_xmit_v6;
else
#endif
cp->packet_xmit = ip_vs_tunnel_xmit;
break;
case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;
case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;
case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
- The DR-mode transmit function ip_vs_dr_xmit
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
int local;
EnterFunction(10);
rcu_read_lock();
/* the route lookup uses cp->daddr.ip (the rs), not the destination ip in the skb (the vip) */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
if (local < 0)
goto tx_error;
if (local) {
rcu_read_unlock();
/* if the route says local, returning accept is enough */
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
ip_send_check(ip_hdr(skb));
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
/* not local: send the packet out through the NF_INET_LOCAL_OUT hook */
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
/* set ipvs_property to 1 to mark the skb as already handled by ipvs */
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
At the NF_INET_LOCAL_OUT hook two more ipvs hook functions run, but because skb->ipvs_property is already set they do nothing and return NF_ACCEPT:
ip_vs_local_reply4 -> ip_vs_out:
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
ip_vs_local_request4 -> ip_vs_in:
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
After these hooks return, dst_output is called, which goes through ip_output -> ip_finish_output -> ip_finish_output2:
int ip_output(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/* pass the NF_INET_POST_ROUTING hook (nothing special happens to the packet there), then call ip_finish_output */
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb_is_gso(skb))
return ip_finish_output_gso(skb);
if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(skb, ip_finish_output2);
return ip_finish_output2(skb);
}
ip_finish_output -> ip_finish_output2 then asks the neighbour subsystem to fill in the MAC addresses, and this is the crucial step for DR mode: the source MAC becomes the LB's MAC and the destination MAC becomes the RS's MAC. That works because the earlier route lookup used the RS address, not the VIP carried in the packet.
The next hop is computed by rt_nexthop: for an off-link destination it is rt->rt_gateway (the gateway address), while for an on-link destination it is the destination address itself. Since the LB and the RS are on the same subnet, the next hop resolved here is the RS address.
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
if (rt->rt_gateway)
return rt->rt_gateway;
return daddr;
}
// in ip_finish_output2: get the neigh entry for the next hop; if none exists, create one.
// the packet can only be sent once the MAC address of the RS is known.
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
// __neigh_create calls tbl->constructor(n), i.e. arp_constructor, which sets:
neigh->ops = &arp_hh_ops;
neigh->output = neigh->ops->output; // neigh_resolve_output
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb);
// inside dst_neigh_output:
// if the ARP entry is in a valid state, the packet goes straight out via dev_queue_xmit
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
else
// otherwise neigh_resolve_output queues the skb on arp_queue and sends an ARP request;
// the packet is transmitted once the ARP reply arrives
return n->output(n, skb); // neigh_resolve_output
- DR mode summary
a. In DR mode the LB does not touch the IP layer when forwarding to an RS; it only rewrites the destination MAC to the RS's MAC, so the LB and the RSes must be on the same L2 network.
b. Both the LB and the RSes must accept packets whose destination IP is the VIP, so the VIP has to be configured on both.
On the LB, keepalived is typically used to run an active/standby pair, with the VIP floating between the two LBs.
On each RS the VIP is configured on the lo interface with a /32 mask, and all NICs on the RS are configured with
arp_ignore=1 - only answer ARP requests whose target address is configured on the receiving interface
arp_announce=2 - ignore the packet's source IP and choose a suitable address on the outgoing interface as the ARP source
c. The client's request passes through the LB to the RS, but the reply does not come back through the LB, so the RS must be able to reach the client directly and its gateway must not point at the LB.
d. Port mapping is not supported, so there is no need to give a port when adding an rs; even if one is given it is rewritten to the service port.
As the code above shows, a packet from the client traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks on the LB. Each hook has two ipvs functions registered (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), but the only one that actually processes the request is ip_vs_remote_request4 at LOCAL_IN.
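To make point b concrete, a typical RS-side setup might look like the following sketch (the VIP 1.1.1.10 is the one from the earlier examples; apply this on every real server):
// put the VIP on lo with a /32 mask and suppress ARP replies/announcements for it
ip addr add 1.1.1.10/32 dev lo
sysctl -w net.ipv4.conf.all.arp_ignore=1
sysctl -w net.ipv4.conf.lo.arp_ignore=1
sysctl -w net.ipv4.conf.all.arp_announce=2
sysctl -w net.ipv4.conf.lo.arp_announce=2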
TUNNEL forwarding mode
[http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.LVS-Tun.html]
If you want to try a test LVS-Tun setup on the bench, take a standard LVS-DR setup LVS-DR example, change lo on the realservers to tunl0 (and handle the ARP problem on tunl0) and change the ipvsadm switch from -g to -i . If your clients are going to be sending large packets, you need to set the MTU (see MTU for the ipip packet DIP->RIP). This can be done on the realserver with iptables (see tunl MTU solved) or iproute2 (see setting the MTU by route).
In other words: to try an LVS-Tun setup on the bench, take an existing LVS-DR setup and tweak it slightly: move the VIP from lo to tunl0 on the real servers, and change the ipvsadm forwarding switch from -g to -i when adding the real servers.
In LVS-Tun, the tunl0 device holds the VIP, just as the lo device holds the device for LVS-DR. You need to build the tunl0 device into the Linux kernel (in networking options - IP:tunneling) - it is turned off by default. The tunnelling (ipip) can be built as a module, in which case you'll have to insmod ipip before you can use it, or you can build ipip directly into the kernel. With a kernel enabled for ipip, you should be able to see the unconfigured tunl0 device with ifconfig or with ip addr show (Feb 2004 - my ifconfig used to see the unconfigured tunl0, but it doesn't anymore.)
In other words: if ipip is built into the kernel, the virtual device tunl0 appears automatically; if it is built as a module instead, load it with modprobe ipip and tunl0 will appear as well, as shown below:
[root@test1 ~]# modprobe ipip
[root@test1 ~]# lsmod | grep ipip
ipip 16384 0
tunnel4 16384 1 ipip
ip_tunnel 24576 1 ipip
[root@test1 ~]# ip a
...
4: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
Then you configure the tunl0 device
ifconfig tunl0 192.168.1.110 netmask 255.255.255.255 broadcast 192.168.1.110
or
ip addr add dev tunl0 192.168.1.110/32 brd 192.168.1.110
Note
the VIP is a /32 addr, so the brd addr is the VIP, not x.x.x.255.
On the LB side the tunnel encapsulation does not depend on any other module: ip_vs_tunnel_xmit builds the ipip packet itself and sends it out. This also means only ipip tunnels are supported.
The RS side must load the ipip module in advance so it can decapsulate the ipip packets. After processing the request, the RS sends the reply directly to the client, without encapsulation and without going back through the LB.
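A minimal LVS-Tun setup following the HOWTO quoted above might therefore look like this (same example addresses as before, purely illustrative):
// on the LB: add the rs with -i (ipip tunnel) instead of -g
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 1.1.1.3:8080 -i
// on the RS: load ipip, put the VIP on tunl0 and bring it up
// (the arp_ignore/arp_announce settings from the DR section still apply)
modprobe ipip
ip addr add 1.1.1.10/32 dev tunl0
ip link set tunl0 up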
The LB-side code path is mostly the same as in LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks, each with two ipvs hook functions (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), and again only ip_vs_remote_request4 at LOCAL_IN does real work; the only difference is that the transmit function called at the end is ip_vs_tunnel_xmit instead of ip_vs_dr_xmit.
/*
* IP Tunneling transmitter
*
* This function encapsulates the packet in a new IP packet, its
* destination will be set to cp->daddr. Most code of this function
* is taken from ipip.c.
*
* It is used in VS/TUN cluster. The load balancer selects a real
* server from a cluster based on a scheduling algorithm,
* encapsulates the request packet and forwards it to the selected
* server. For example, all real servers are configured with
* "ifconfig tunl0 <Virtual IP Address> up". When the server receives
* the encapsulated packet, it will decapsulate the packet, processe
* the request and return the response packets directly to the client
* without passing the load balancer. This can greatly increase the
* scalability of virtual server.
*
* Used for ANY protocol
*/
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
__u8 next_protocol = 0;
__u8 dsfield = 0;
__u8 ttl = 0;
__be16 df = 0;
__be16 *dfp = NULL;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
EnterFunction(10);
rcu_read_lock();
/* route lookup by cp->daddr; note that daddr is the rs ip chosen by the scheduler, not the vip in the packet */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_CONNECT |
IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
if (local < 0)
goto tx_error;
/* if the route says local, just return accept */
if (local) {
rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
/* take the outgoing device from the route */
rt = skb_rtable(skb);
tdev = rt->dst.dev;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
/* compute the extra header room needed on the outgoing device for the outer IP header */
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
/* make sure the skb has max_headroom of headroom for the encapsulation, and pull the needed fields out of the inner IP header */
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
goto tx_error;
skb = iptunnel_handle_offloads(
skb, false, __tun_gso_type_mask(AF_INET, cp->af));
if (IS_ERR(skb))
goto tx_error;
skb->transport_header = skb->network_header;
/* make room in front of the inner IP header for one more IP header, the outer one */
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
/* fill in the outer IP header: saddr was provided by the route lookup, daddr is cp->daddr, i.e. the rs ip */
/*
* Push down and install the IPIP header.
*/
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = next_protocol;
iph->tos = dsfield;
iph->daddr = cp->daddr.ip;
iph->saddr = saddr;
iph->ttl = ttl;
ip_select_ident(skb, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
/* hand the packet to the LOCAL_OUT hook; by now it is an ipip packet whose outer
 * header carries the LB and rs tunnel addresses and whose inner header still
 * carries the client ip and the vip */
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
tx_error:
if (!IS_ERR(skb))
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
DNAT forwarding mode
The LB-side code path is again mostly the same as LVS-DR: the client's packet traverses the NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT hooks with their two ipvs functions each (ip_vs_reply4 / ip_vs_remote_request4 and ip_vs_local_reply4 / ip_vs_local_request4), and only ip_vs_remote_request4 at LOCAL_IN does real work; the difference is that the transmit function called at the end is ip_vs_nat_xmit instead of ip_vs_dr_xmit.
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
EnterFunction(10);
rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
was_input = rt_is_input_route(skb_rtable(skb));
/* route lookup by cp->daddr.ip, not by the destination ip in the skb (the vip) */
local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR, NULL, ipvsh);
if (local < 0)
goto tx_error;
rt = skb_rtable(skb);
...
/* call the protocol's dnat_handler (tcp_dnat_handler), which rewrites the destination port to cp->dport */
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
/* rewrite the destination ip to cp->daddr.ip */
ip_hdr(skb)->daddr = cp->daddr.ip;
/* recompute the IP checksum */
ip_send_check(ip_hdr(skb));
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
rcu_read_unlock();
LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
struct ip_vs_conn *cp, int local)
{
int ret = NF_STOLEN;
skb->ipvs_property = 1;
if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 1);
/* Remove the early_demux association unless it's bound for the
* exact same port and address on this host after translation.
*/
if (!local || cp->vport != cp->dport ||
!ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
ip_vs_drop_early_demux_sk(skb);
if (!local) {
skb_forward_csum(skb);
/* as before, send the DNATed packet out through the LOCAL_OUT hook */
NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
dst_output);
} else
ret = NF_ACCEPT;
return ret;
}
So the flow arriving at the LB is cip:cport -> vip:vport,
and after DNAT it becomes cip:cport -> rip:rport.
The RS's reply is rip:rport -> cip:cport. rip:rport has to be translated back to vip:vport, so the reply must go through the LB for SNAT. Because its destination ip is not an address of the LB, the RS's default gateway must point at the LB. When the reply reaches the LB, the route lookup finds that the destination is not local, so the packet has to be forwarded (net.ipv4.ip_forward = 1 is required). It goes through ip_forward, which at the end passes the NF_INET_FORWARD hook:
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
Two ipvs functions are registered at this hook: ip_vs_forward_icmp and ip_vs_reply4. The former obviously handles ICMP; the interesting one is ip_vs_reply4.
static unsigned int
ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
return ip_vs_out(ops->hooknum, skb, AF_INET);
}
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
EnterFunction(11);
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/* Bad... Do not break raw sockets */
if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
if (inet && sk->sk_family == PF_INET && inet->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
net = skb_net(skb);
if (!net_ipvs(net)->enable)
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_out_icmp(skb, &related, hooknum);
if (related)
return verdict;
}
pd = ip_vs_proto_data_get(net, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
/* the connection was created keyed on cip and cport in the client-to-rs direction,
 * so for the reverse direction we look it up by destination ip and port, which are
 * again cip and cport */
cp = pp->conn_out_get(af, skb, &iph, 0);
/* only packets that match an existing connection need handling here */
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph);
...
}
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, iph->len))
goto drop;
/* mangle the packet */
/* call the protocol's snat_handler (tcp_snat_handler), which rewrites the source port back to vport */
if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
goto drop;
{
/* rewrite the source ip to vaddr */
ip_hdr(skb)->saddr = cp->vaddr.ip;
ip_send_check(ip_hdr(skb));
}
/*
* nf_iterate does not expect change in the skb->dst->dev.
* It looks like it is not fatal to enable this code for hooks
* where our handlers are at the end of the chain list and
* when all next handlers use skb->dst->dev and not outdev.
* It will definitely route properly the inout NAT traffic
* when multiple paths are used.
*/
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
* if it came from this machine itself. So re-compute
* the routing information.
*/
if (ip_vs_route_me_harder(af, skb))
goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
LeaveFunction(11);
/* just return NF_ACCEPT; after the hook returns, ip_forward_finish eventually sends the packet on to the client */
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
LeaveFunction(11);
return NF_STOLEN;
}
DNAT forwarding summary:
The rs's default gateway must point at the DIP (the LB).
Both requests and replies pass through the LB, so under heavy traffic the LB can become the bottleneck.
Port mapping is supported.
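A minimal NAT-mode sketch for comparison (10.0.0.0/24 stands in for an internal RS network and is purely hypothetical; 10.0.0.1 is assumed to be the LB's DIP):
// on the LB: -m selects NAT; port mapping is allowed here
ipvsadm -A -t 1.1.1.10:8080 -s sh
ipvsadm -a -t 1.1.1.10:8080 -r 10.0.0.3:80 -m
sysctl -w net.ipv4.ip_forward=1
// on each RS: the default route must go back through the LB's DIP
ip route add default via 10.0.0.1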













