Rework kube-proxy into LB plugin

Add support of NAT66

Change-Id: Ie6aa79078a3835f989829b9a597c448dfd2f9ea3
Signed-off-by: Hongjun Ni <hongjun.ni@intel.com>
This commit is contained in:
Hongjun Ni
2018-02-06 23:00:22 +08:00
committed by Damjan Marion
parent afe56de947
commit d92a0b553f
22 changed files with 1846 additions and 4271 deletions

View File

@ -229,7 +229,6 @@ PLUGIN_ENABLED(igmp)
PLUGIN_ENABLED(ila)
PLUGIN_ENABLED(ioam)
PLUGIN_ENABLED(ixge)
PLUGIN_ENABLED(kubeproxy)
PLUGIN_ENABLED(l2e)
PLUGIN_ENABLED(lacp)
PLUGIN_ENABLED(lb)

View File

@ -75,10 +75,6 @@ if ENABLE_IXGE_PLUGIN
include ixge.am
endif
if ENABLE_KUBEPROXY_PLUGIN
include kubeproxy.am
endif
if ENABLE_LACP_PLUGIN
include lacp.am
endif

View File

@ -1,38 +0,0 @@
# Copyright (c) 2017 Intel Corporation, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
vppapitestplugins_LTLIBRARIES += kubeproxy_test_plugin.la
vppplugins_LTLIBRARIES += kubeproxy_plugin.la
kubeproxy_plugin_la_SOURCES = \
kubeproxy/kp.c \
kubeproxy/kp_node.c \
kubeproxy/kp_cli.c \
kubeproxy/kp_api.c
BUILT_SOURCES += \
kubeproxy/kp.api.h \
kubeproxy/kp.api.json
API_FILES += kubeproxy/kp.api
noinst_HEADERS += \
kubeproxy/kp.h \
kubeproxy/kphash.h \
kubeproxy/kp.api.h
kubeproxy_test_plugin_la_SOURCES = \
kubeproxy/kp_test.c \
kubeproxy/kp_plugin.api.h
# vi:syntax=automake

View File

@ -1,81 +0,0 @@
/*
* Copyright (c) 2017 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
option version = "1.0.0";
/** \brief Configure Kube-proxy global parameters
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
@param sticky_buckets_per_core - Number of buckets *per worker thread* in the
established flow table (must be power of 2).
@param flow_timeout - Time in seconds after which, if no packet is received
for a given flow, the flow is removed from the established flow table.
*/
autoreply define kp_conf
{
u32 client_index;
u32 context;
u32 sticky_buckets_per_core;
u32 flow_timeout;
};
/** \brief Add a virtual address (or prefix)
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
@param ip_prefix - IP address (IPv4 in lower order 32 bits).
@param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4).
@param is_ipv6 - Is IPv6 addresss.
@param port - service port;
@param target_port - Pod's port corresponding to specific service.
@param node_port - Node's port.
@param is_nat4 - DNAT is NAT44 (NAT64 otherwise).
@param new_flows_table_length - Size of the new connections flow table used
for this VIP (must be power of 2).
@param is_del - The VIP should be removed.
*/
autoreply define kp_add_del_vip {
u32 client_index;
u32 context;
u8 ip_prefix[16];
u8 prefix_length;
u8 is_ipv6;
u16 port;
u16 target_port;
u16 node_port;
u8 is_nat4;
u32 new_flows_table_length;
u8 is_del;
};
/** \brief Add a pod for a given VIP
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
@param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits).
@param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4).
@param vip_is_ipv6 - VIP is IPv6 addresss.
@param pod_address - The pod's IP address (IPv4 in lower order 32 bits).
@param pod_is_ipv6 - Pod is IPv6 addresss.
@param is_del - The Pod should be removed.
*/
autoreply define kp_add_del_pod {
u32 client_index;
u32 context;
u8 vip_ip_prefix[16];
u8 vip_prefix_length;
u8 vip_is_ipv6;
u8 pod_address[16];
u8 pod_is_ipv6;
u8 is_del;
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,249 +0,0 @@
/*
* Copyright (c) 2016 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "POD IS" BPODIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <kubeproxy/kp.h>
#include <vppinfra/byte_order.h>
#include <vlibapi/api.h>
#include <vlibmemory/api.h>
#define vl_msg_id(n,h) n,
typedef enum {
#include <kubeproxy/kp.api.h>
/* We'll want to know how many messages IDs we need... */
VL_MSG_FIRST_AVAILABLE,
} vl_msg_id_t;
#undef vl_msg_id
/* define message structures */
#define vl_typedefs
#include <kubeproxy/kp.api.h>
#undef vl_typedefs
/* define generated endian-swappers */
#define vl_endianfun
#include <kubeproxy/kp.api.h>
#undef vl_endianfun
#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
/* Get the API version number */
#define vl_api_version(n,v) static u32 api_version=(v);
#include <kubeproxy/kp.api.h>
#undef vl_api_version
#define vl_msg_name_crc_list
#include <kubeproxy/kp.api.h>
#undef vl_msg_name_crc_list
#define REPLY_MSG_ID_BASE kpm->msg_id_base
#include <vlibapi/api_helper_macros.h>
static void
setup_message_id_table (kp_main_t * kpm, api_main_t * am)
{
#define _(id,n,crc) \
vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + kpm->msg_id_base);
foreach_vl_msg_name_crc_kp;
#undef _
}
/* Macro to finish up custom dump fns */
#define FINISH \
vec_add1 (s, 0); \
vl_print (handle, (char *)s); \
vec_free (s); \
return handle;
static void
vl_api_kp_conf_t_handler
(vl_api_kp_conf_t * mp)
{
kp_main_t *kpm = &kp_main;
vl_api_kp_conf_reply_t * rmp;
int rv = 0;
rv = kp_conf(mp->sticky_buckets_per_core,
mp->flow_timeout);
REPLY_MACRO (VL_API_KP_CONF_REPLY);
}
static void *vl_api_kp_conf_t_print
(vl_api_kp_conf_t *mp, void * handle)
{
u8 * s;
s = format (0, "SCRIPT: kp_conf ");
s = format (s, "%u ", mp->sticky_buckets_per_core);
s = format (s, "%u ", mp->flow_timeout);
FINISH;
}
static void
vl_api_kp_add_del_vip_t_handler
(vl_api_kp_add_del_vip_t * mp)
{
kp_main_t *kpm = &kp_main;
vl_api_kp_conf_reply_t * rmp;
int rv = 0;
ip46_address_t prefix;
u8 prefix_length = mp->prefix_length;
if (mp->is_ipv6 == 0)
{
prefix_length += 96;
memcpy(&prefix.ip4, mp->ip_prefix, sizeof(prefix.ip4));
prefix.pad[0] = prefix.pad[1] = prefix.pad[2] = 0;
}
else
{
memcpy(&prefix.ip6, mp->ip_prefix, sizeof(prefix.ip6));
}
if (mp->is_del) {
u32 vip_index;
if (!(rv = kp_vip_find_index(&prefix, prefix_length, &vip_index)))
rv = kp_vip_del(vip_index);
} else {
u32 vip_index;
kp_vip_type_t type;
if (mp->is_ipv6 == 0) {
type = mp->is_nat4?KP_VIP_TYPE_IP4_NAT44:KP_VIP_TYPE_IP4_NAT46;
} else {
type = mp->is_nat4?KP_VIP_TYPE_IP6_NAT64:KP_VIP_TYPE_IP6_NAT66;
}
rv = kp_vip_add(&prefix, prefix_length, type,
ntohl(mp->new_flows_table_length), &vip_index,
ntohs(mp->port), ntohs(mp->target_port),
ntohs(mp->node_port));
}
REPLY_MACRO (VL_API_KP_CONF_REPLY);
}
static void *vl_api_kp_add_del_vip_t_print
(vl_api_kp_add_del_vip_t *mp, void * handle)
{
u8 * s;
s = format (0, "SCRIPT: kp_add_del_vip ");
s = format (s, "%U ", format_ip46_prefix,
(ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY);
s = format (s, "port %u ", mp->port);
s = format (s, "target_port %u ", mp->target_port);
s = format (s, "node_port %u ", mp->node_port);
s = format (s, "%s ", mp->is_nat4?"nat4":"nat6");
s = format (s, "%u ", mp->new_flows_table_length);
s = format (s, "%s ", mp->is_del?"del":"add");
FINISH;
}
static void
vl_api_kp_add_del_pod_t_handler
(vl_api_kp_add_del_pod_t * mp)
{
kp_main_t *kpm = &kp_main;
vl_api_kp_conf_reply_t * rmp;
int rv = 0;
u32 vip_index;
ip46_address_t vip_ip_prefix;
u8 vip_prefix_length = mp->vip_prefix_length;
if (mp->vip_is_ipv6 == 0)
{
vip_prefix_length += 96;
memcpy(&vip_ip_prefix.ip4, mp->vip_ip_prefix,
sizeof(vip_ip_prefix.ip4));
vip_ip_prefix.pad[0] = vip_ip_prefix.pad[1] = vip_ip_prefix.pad[2] = 0;
}
else
{
memcpy(&vip_ip_prefix.ip6, mp->vip_ip_prefix,
sizeof(vip_ip_prefix.ip6));
}
ip46_address_t pod_address;
if (mp->pod_is_ipv6 == 0)
{
memcpy(&pod_address.ip4, mp->pod_address,
sizeof(pod_address.ip4));
pod_address.pad[0] = pod_address.pad[1] = pod_address.pad[2] = 0;
}
else
{
memcpy(&pod_address.ip6, mp->pod_address,
sizeof(pod_address.ip6));
}
if ((rv = kp_vip_find_index(&vip_ip_prefix, vip_prefix_length, &vip_index)))
goto done;
if (mp->is_del)
rv = kp_vip_del_pods(vip_index, &pod_address, 1);
else
rv = kp_vip_add_pods(vip_index, &pod_address, 1);
done:
REPLY_MACRO (VL_API_KP_CONF_REPLY);
}
static void *vl_api_kp_add_del_pod_t_print
(vl_api_kp_add_del_pod_t *mp, void * handle)
{
u8 * s;
s = format (0, "SCRIPT: kp_add_del_pod ");
s = format (s, "%U ", format_ip46_prefix,
(ip46_address_t *)mp->vip_ip_prefix, mp->vip_prefix_length, IP46_TYPE_ANY);
s = format (s, "%U ", format_ip46_address,
(ip46_address_t *)mp->pod_address, IP46_TYPE_ANY);
s = format (s, "%s ", mp->is_del?"del":"add");
FINISH;
}
/* List of message types that this plugin understands */
#define foreach_kp_plugin_api_msg \
_(KP_CONF, kp_conf) \
_(KP_ADD_DEL_VIP, kp_add_del_vip) \
_(KP_ADD_DEL_POD, kp_add_del_pod)
static clib_error_t * kp_api_init (vlib_main_t * vm)
{
kp_main_t *kpm = &kp_main;
u8 *name = format (0, "kp_%08x%c", api_version, 0);
kpm->msg_id_base = vl_msg_api_get_msg_ids
((char *) name, VL_MSG_FIRST_AVAILABLE);
#define _(N,n) \
vl_msg_api_set_handlers((VL_API_##N + kpm->msg_id_base), \
#n, \
vl_api_##n##_t_handler, \
vl_noop_handler, \
vl_api_##n##_t_endian, \
vl_api_##n##_t_print, \
sizeof(vl_api_##n##_t), 1);
foreach_kp_plugin_api_msg;
#undef _
/* Add our API messages to the global name_crc hash table */
setup_message_id_table (kpm, &api_main);
return 0;
}
VLIB_INIT_FUNCTION (kp_api_init);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,105 +0,0 @@
# Kube-proxy plugin for VPP {#kp_plugin_doc}
## Overview
This plugin provides kube-proxy data plane on user space,
which is used to replace linux kernal's kube-proxy based on iptables.
The idea is largely inspired from VPP LB plugin.
Currently, kube-proxy plugin supports three service types:
1) Cluster IP plus Port: support any protocols, including TCP, UDP.
2) Node IP plus Node Port: currently only support UDP.
3) External Load Balancer.
For Cluster IP plus Port case:
kube-proxy is configured with a set of Virtual IPs (VIP, which can be
prefixes), and for each VIP, with a set of POD addresses (PODs).
For a specific session received for a given VIP (or VIP prefix),
first packet selects a Pod according to internal load balancing algorithm,
then does DNAT operation and sent to chosen Pod.
At the same time, will create a session entry to store Pod chosen result.
Following packets for that session will look up session table first,
which ensures that a given session will always be routed to the same Pod.
For returned packet from Pod, it will do SNAT operation and sent out.
Please refer to below for details:
https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf
## Configuration
### Global KP parameters
The kube-proxy needs to be configured with some parameters:
ku conf [buckets <n>] [timeout <s>]
buckets: the *per-thread* established-connections-table number of buckets.
timeout: the number of seconds a connection will remain in the
established-connections-table while no packet for this flow
is received.
### Configure VIPs and Ports
ku vip <prefix> port <n> target_port <n> node_port <n> \
[nat4|nat6)] [new_len <n>] [del]
new_len is the size of the new-connection-table. It should be 1 or 2 orders of
magnitude bigger than the number of PODs for the VIP in order to ensure a good
load balancing.
Examples:
ku vip 90.0.0.0/8 nat44 new_len 2048
ku vip 2003::/16 nat66 new_len 2048
### Configure PODs (for each VIP)
ku pod <vip-prefix> [<address> [<address> [...]]] [del]
You can add (or delete) as many PODs at a time (for a single VIP).
Examples:
ku pod 90.0.0.0/8 10.0.0.1
ku pod 2002::/16 2001::2 2001::3 2001::4
### Configure SNAT
ku set interface nat4 in <intfc> [del]
Set SNAT feature in a specific interface.
## Monitoring
The plugin provides quite a bunch of counters and information.
show ku
show ku vip verbose
show node counters
## Design notes
### Multi-Threading
This implementation implement parallelism by using
one established-connections table per thread. This is equivalent to assuming
that RSS will make a job similar to ECMP, and is pretty useful as threads don't
need to get a lock in order to write in the table.
### Hash Table
A kube-proxy requires an efficient read and write Hash table. The Hash table
used by ip6-forward is very read-efficient, but not so much for writing. In
addition, it is not a big deal if writing into the Hash table fails.
The plugin therefore uses a very specific Hash table.
- Fixed (and power of 2) number of buckets (configured at runtime)
- Fixed (and power of 2) elements per buckets (configured at compilation time)

View File

@ -1,268 +0,0 @@
/*
* Copyright (c) 2016 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "POD IS" BPODIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vat/vat.h>
#include <vlibapi/api.h>
#include <vlibmemory/api.h>
#include <vppinfra/error.h>
#include <kubeproxy/kp.h>
#define __plugin_msg_base kp_test_main.msg_id_base
#include <vlibapi/vat_helper_macros.h>
//TODO: Move that to vat/plugin_api.c
//////////////////////////
uword unformat_ip46_address (unformat_input_t * input, va_list * args)
{
ip46_address_t *ip46 = va_arg (*args, ip46_address_t *);
ip46_type_t type = va_arg (*args, ip46_type_t);
if ((type != IP46_TYPE_IP6) &&
unformat(input, "%U", unformat_ip4_address, &ip46->ip4)) {
ip46_address_mask_ip4(ip46);
return 1;
} else if ((type != IP46_TYPE_IP4) &&
unformat(input, "%U", unformat_ip6_address, &ip46->ip6)) {
return 1;
}
return 0;
}
uword unformat_ip46_prefix (unformat_input_t * input, va_list * args)
{
ip46_address_t *ip46 = va_arg (*args, ip46_address_t *);
u8 *len = va_arg (*args, u8 *);
ip46_type_t type = va_arg (*args, ip46_type_t);
u32 l;
if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) {
if (l > 32)
return 0;
*len = l + 96;
ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0;
} else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) {
if (l > 128)
return 0;
*len = l;
} else {
return 0;
}
return 1;
}
/////////////////////////
#define vl_msg_id(n,h) n,
typedef enum {
#include <kubeproxy/kp.api.h>
/* We'll want to know how many messages IDs we need... */
VL_MSG_FIRST_AVAILABLE,
} vl_msg_id_t;
#undef vl_msg_id
/* define message structures */
#define vl_typedefs
#include <kubeproxy/kp.api.h>
#undef vl_typedefs
/* declare message handlers for each api */
#define vl_endianfun /* define message structures */
#include <kubeproxy/kp.api.h>
#undef vl_endianfun
/* instantiate all the print functions we know about */
#define vl_print(handle, ...)
#define vl_printfun
#include <kubeproxy/kp.api.h>
#undef vl_printfun
/* Get the API version number. */
#define vl_api_version(n,v) static u32 api_version=(v);
#include <kubeproxy/kp.api.h>
#undef vl_api_version
typedef struct {
/* API message ID base */
u16 msg_id_base;
vat_main_t *vat_main;
} kp_test_main_t;
kp_test_main_t kp_test_main;
#define foreach_standard_reply_retval_handler \
_(kp_conf_reply) \
_(kp_add_del_vip_reply) \
_(kp_add_del_pod_reply)
#define _(n) \
static void vl_api_##n##_t_handler \
(vl_api_##n##_t * mp) \
{ \
vat_main_t * vam = kp_test_main.vat_main; \
i32 retval = ntohl(mp->retval); \
if (vam->async_mode) { \
vam->async_errors += (retval < 0); \
} else { \
vam->retval = retval; \
vam->result_ready = 1; \
} \
}
foreach_standard_reply_retval_handler;
#undef _
/*
* Table of message reply handlers, must include boilerplate handlers
* we just generated
*/
#define foreach_vpe_api_reply_msg \
_(KP_CONF_REPLY, kp_conf_reply) \
_(KP_ADD_DEL_VIP_REPLY, kp_add_del_vip_reply) \
_(KP_ADD_DEL_POD_REPLY, kp_add_del_pod_reply)
static int api_kp_conf (vat_main_t * vam)
{
unformat_input_t *i = vam->input;
vl_api_kp_conf_t mps, *mp;
int ret;
if (!unformat(i, "%u %u",
&mps.sticky_buckets_per_core,
&mps.flow_timeout)) {
errmsg ("invalid arguments\n");
return -99;
}
M(KP_CONF, mp);
S(mp);
W (ret);
return ret;
}
static int api_kp_add_del_vip (vat_main_t * vam)
{
unformat_input_t * i = vam->input;
vl_api_kp_add_del_vip_t mps, *mp;
int ret;
mps.is_del = 0;
mps.is_nat4 = 0;
if (!unformat(i, "%U",
unformat_ip46_prefix, mps.ip_prefix, &mps.prefix_length, IP46_TYPE_ANY)) {
errmsg ("invalid prefix\n");
return -99;
}
if (unformat(i, "nat4")) {
mps.is_nat4 = 1;
} else if (unformat(i, "nat6")) {
mps.is_nat4 = 0;
} else {
errmsg ("no nat\n");
return -99;
}
if (!unformat(i, "%d", &mps.new_flows_table_length)) {
errmsg ("no table lentgh\n");
return -99;
}
if (unformat(i, "del")) {
mps.is_del = 1;
}
M(KP_ADD_DEL_VIP, mp);
S(mp);
W (ret);
return ret;
}
static int api_kp_add_del_pod (vat_main_t * vam)
{
unformat_input_t * i = vam->input;
vl_api_kp_add_del_pod_t mps, *mp;
int ret;
mps.is_del = 0;
if (!unformat(i, "%U %U",
unformat_ip46_prefix, mps.vip_ip_prefix, &mps.vip_prefix_length, IP46_TYPE_ANY,
unformat_ip46_address, mps.pod_address)) {
errmsg ("invalid prefix or address\n");
return -99;
}
if (unformat(i, "del")) {
mps.is_del = 1;
}
M(KP_ADD_DEL_POD, mp);
S(mp);
W (ret);
return ret;
}
/*
* List of messages that the api test plugin sends,
* and that the data plane plugin processes
*/
#define foreach_vpe_api_msg \
_(kp_conf, "<sticky_buckets_per_core> <flow_timeout>") \
_(kp_add_del_vip, "<ip-prefix> <port> <target_port> <node_port> " \
"[nat4|nat6] <new_table_len> [del]") \
_(kp_add_del_pod, "<vip-ip-prefix> <address> [del]")
static void
kp_vat_api_hookup (vat_main_t *vam)
{
kp_test_main_t * kptm = &kp_test_main;
/* Hook up handlers for replies from the data plane plug-in */
#define _(N,n) \
vl_msg_api_set_handlers((VL_API_##N + kptm->msg_id_base), \
#n, \
vl_api_##n##_t_handler, \
vl_noop_handler, \
vl_api_##n##_t_endian, \
vl_api_##n##_t_print, \
sizeof(vl_api_##n##_t), 1);
foreach_vpe_api_reply_msg;
#undef _
/* API messages we can send */
#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n);
foreach_vpe_api_msg;
#undef _
/* Help strings */
#define _(n,h) hash_set_mem (vam->help_by_name, #n, h);
foreach_vpe_api_msg;
#undef _
}
clib_error_t * vat_plugin_register (vat_main_t *vam)
{
kp_test_main_t * kptm = &kp_test_main;
u8 * name;
kptm->vat_main = vam;
/* Ask the vpp engine for the first assigned message-id */
name = format (0, "kp_%08x%c", api_version, 0);
kptm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name);
if (kptm->msg_id_base != (u16) ~0)
kp_vat_api_hookup (vam);
vec_free(name);
return 0;
}

View File

@ -1,216 +0,0 @@
/*
* Copyright (c) 2017 Intel and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* vppinfra already includes tons of different hash tables.
* MagLev flow table is a bit different. It has to be very efficient
* for both writing and reading operations. But it does not need to
* be 100% reliable (write can fail). It also needs to recycle
* old entries in a lazy way.
*
* This hash table is the most dummy hash table you can do.
* Fixed total size, fixed bucket size.
* Advantage is that it could be very efficient (maybe).
*
*/
#ifndef KP_PLUGIN_KP_KPHASH_H_
#define KP_PLUGIN_KP_KPHASH_H_
#include <vnet/vnet.h>
#include <vppinfra/xxhash.h>
#include <vppinfra/crc32.h>
/*
* @brief Number of entries per bucket.
*/
#define KPHASH_ENTRY_PER_BUCKET 4
#define KP_HASH_DO_NOT_USE_SSE_BUCKETS 0
/**
* 32 bits integer comparison for running values.
* 1 > 0 is true. But 1 > 0xffffffff also is.
*/
#define clib_u32_loop_gt(a, b) (((u32)(a)) - ((u32)(b)) < 0x7fffffff)
/*
* @brief One bucket contains 4 entries.
* Each bucket takes one 64B cache line in memory.
*/
typedef struct {
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
u32 hash[KPHASH_ENTRY_PER_BUCKET];
u32 timeout[KPHASH_ENTRY_PER_BUCKET];
u32 vip[KPHASH_ENTRY_PER_BUCKET];
u32 value[KPHASH_ENTRY_PER_BUCKET];
} kp_hash_bucket_t;
typedef struct {
u32 buckets_mask;
u32 timeout;
kp_hash_bucket_t buckets[];
} kp_hash_t;
#define kp_hash_nbuckets(h) (((h)->buckets_mask) + 1)
#define kp_hash_size(h) ((h)->buckets_mask + KPHASH_ENTRY_PER_BUCKET)
#define kp_hash_foreach_bucket(h, bucket) \
for (bucket = (h)->buckets; \
bucket < (h)->buckets + kp_hash_nbuckets(h); \
bucket++)
#define kp_hash_foreach_entry(h, bucket, i) \
kp_hash_foreach_bucket(h, bucket) \
for (i = 0; i < KPHASH_ENTRY_PER_BUCKET; i++)
#define kp_hash_foreach_valid_entry(h, bucket, i, now) \
kp_hash_foreach_entry(h, bucket, i) \
if (!clib_u32_loop_gt((now), bucket->timeout[i]))
static_always_inline
kp_hash_t *kp_hash_alloc(u32 buckets, u32 timeout)
{
if (!is_pow2(buckets))
return NULL;
// Allocate 1 more bucket for prefetch
u32 size = ((u64)&((kp_hash_t *)(0))->buckets[0]) +
sizeof(kp_hash_bucket_t) * (buckets + 1);
u8 *mem = 0;
kp_hash_t *h;
vec_alloc_aligned(mem, size, CLIB_CACHE_LINE_BYTES);
h = (kp_hash_t *)mem;
h->buckets_mask = (buckets - 1);
h->timeout = timeout;
return h;
}
static_always_inline
void kp_hash_free(kp_hash_t *h)
{
u8 *mem = (u8 *)h;
vec_free(mem);
}
static_always_inline
u32 kp_hash_hash(u64 k0, u64 k1, u64 k2, u64 k3, u64 k4)
{
#ifdef clib_crc32c_uses_intrinsics
u64 key[5];
key[0] = k0;
key[1] = k1;
key[2] = k2;
key[3] = k3;
key[4] = k4;
return clib_crc32c ((u8 *) key, 40);
#else
u64 tmp = k0 ^ k1 ^ k2 ^ k3 ^ k4;
return (u32)clib_xxhash (tmp);
#endif
}
static_always_inline
void kp_hash_prefetch_bucket(kp_hash_t *ht, u32 hash)
{
kp_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask];
CLIB_PREFETCH(bucket, sizeof(*bucket), READ);
}
static_always_inline
void kp_hash_get(kp_hash_t *ht, u32 hash, u32 vip, u32 time_now,
u32 *available_index, u32 *found_value)
{
kp_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask];
*found_value = ~0;
*available_index = ~0;
#if __SSE4_2__ && KP_HASH_DO_NOT_USE_SSE_BUCKETS == 0
u32 bitmask, found_index;
__m128i mask;
// mask[*] = timeout[*] > now
mask = _mm_cmpgt_epi32(_mm_loadu_si128 ((__m128i *) bucket->timeout),
_mm_set1_epi32 (time_now));
// bitmask[*] = now <= timeout[*/4]
bitmask = (~_mm_movemask_epi8(mask)) & 0xffff;
// Get first index with now <= timeout[*], if any.
*available_index = (bitmask)?__builtin_ctz(bitmask)/4:*available_index;
// mask[*] = (timeout[*] > now) && (hash[*] == hash)
mask = _mm_and_si128(mask,
_mm_cmpeq_epi32(
_mm_loadu_si128 ((__m128i *) bucket->hash),
_mm_set1_epi32 (hash)));
// Load the array of vip values
// mask[*] = (timeout[*] > now) && (hash[*] == hash) && (vip[*] == vip)
mask = _mm_and_si128(mask,
_mm_cmpeq_epi32(
_mm_loadu_si128 ((__m128i *) bucket->vip),
_mm_set1_epi32 (vip)));
// mask[*] = (timeout[*x4] > now) && (hash[*x4] == hash) && (vip[*x4] == vip)
bitmask = _mm_movemask_epi8(mask);
// Get first index, if any
found_index = (bitmask)?__builtin_ctzll(bitmask)/4:0;
ASSERT(found_index < 4);
*found_value = (bitmask)?bucket->value[found_index]:*found_value;
bucket->timeout[found_index] =
(bitmask)?time_now + ht->timeout:bucket->timeout[found_index];
#else
u32 i;
for (i = 0; i < KPHASH_ENTRY_PER_BUCKET; i++) {
u8 cmp = (bucket->hash[i] == hash && bucket->vip[i] == vip);
u8 timeouted = clib_u32_loop_gt(time_now, bucket->timeout[i]);
*found_value = (cmp || timeouted)?*found_value:bucket->value[i];
bucket->timeout[i] = (cmp || timeouted)?time_now + ht->timeout:bucket->timeout[i];
*available_index = (timeouted && (*available_index == ~0))?i:*available_index;
if (!cmp)
return;
}
#endif
}
static_always_inline
u32 kp_hash_available_value(kp_hash_t *h, u32 hash, u32 available_index)
{
return h->buckets[hash & h->buckets_mask].value[available_index];
}
static_always_inline
void kp_hash_put(kp_hash_t *h, u32 hash, u32 value, u32 vip,
u32 available_index, u32 time_now)
{
kp_hash_bucket_t *bucket = &h->buckets[hash & h->buckets_mask];
bucket->hash[available_index] = hash;
bucket->value[available_index] = value;
bucket->timeout[available_index] = time_now + h->timeout;
bucket->vip[available_index] = vip;
}
static_always_inline
u32 kp_hash_elts(kp_hash_t *h, u32 time_now)
{
u32 tot = 0;
kp_hash_bucket_t *bucket;
u32 i;
kp_hash_foreach_valid_entry(h, bucket, i, time_now) {
tot++;
}
return tot;
}
#endif /* KP_PLUGIN_KP_KPHASH_H_ */

View File

@ -107,33 +107,52 @@ vl_api_lb_add_del_vip_t_handler
lb_main_t *lbm = &lb_main;
vl_api_lb_conf_reply_t * rmp;
int rv = 0;
ip46_address_t prefix;
memcpy(&prefix.ip6, mp->ip_prefix, sizeof(prefix.ip6));
lb_vip_add_args_t args;
memcpy (&(args.prefix.ip6), mp->ip_prefix, sizeof(args.prefix.ip6));
if (mp->is_del) {
u32 vip_index;
if (!(rv = lb_vip_find_index(&prefix, mp->prefix_length, &vip_index)))
if (!(rv = lb_vip_find_index(&(args.prefix), mp->prefix_length, &vip_index)))
rv = lb_vip_del(vip_index);
} else {
u32 vip_index;
lb_vip_type_t type = 0;
if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) {
if (ip46_prefix_is_ip4(&(args.prefix), mp->prefix_length)) {
if (mp->encap == LB_ENCAP_TYPE_GRE4)
type = LB_VIP_TYPE_IP4_GRE4;
else if (mp->encap == LB_ENCAP_TYPE_GRE6)
type = LB_VIP_TYPE_IP4_GRE6;
else if (mp->encap == LB_ENCAP_TYPE_L3DSR)
type = LB_VIP_TYPE_IP4_L3DSR;
else if (mp->encap == LB_ENCAP_TYPE_NAT4)
type = LB_VIP_TYPE_IP4_NAT4;
} else {
if (mp->encap == LB_ENCAP_TYPE_GRE4)
type = LB_VIP_TYPE_IP6_GRE4;
else if (mp->encap == LB_ENCAP_TYPE_GRE6)
type = LB_VIP_TYPE_IP6_GRE6;
else if (mp->encap == LB_ENCAP_TYPE_NAT6)
type = LB_VIP_TYPE_IP6_NAT6;
}
rv = lb_vip_add(&prefix, mp->prefix_length, type, mp->dscp,
mp->new_flows_table_length, &vip_index);
args.plen = mp->prefix_length;
args.type = type;
args.new_length = mp->new_flows_table_length;
if (mp->encap == LB_ENCAP_TYPE_L3DSR) {
args.encap_args.dscp = (u8)(mp->dscp & 0x3F);
}
else if ((mp->encap == LB_ENCAP_TYPE_NAT4)
||(mp->encap == LB_ENCAP_TYPE_NAT6)) {
args.encap_args.srv_type = mp->type;
args.encap_args.port = ntohs(mp->port);
args.encap_args.target_port = ntohs(mp->target_port);
args.encap_args.node_port = ntohs(mp->node_port);
}
rv = lb_vip_add(args, &vip_index);
}
REPLY_MACRO (VL_API_LB_CONF_REPLY);
}
@ -146,8 +165,26 @@ static void *vl_api_lb_add_del_vip_t_print
s = format (s, "%U ", format_ip46_prefix,
(ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY);
s = format (s, "%s ", (mp->encap==LB_ENCAP_TYPE_GRE4)?
"gre4":(mp->encap==LB_ENCAP_TYPE_GRE6)?"gre6":"l3dsr");
s = format (s, "%s ", (mp->encap == LB_ENCAP_TYPE_GRE4)? "gre4"
: (mp->encap == LB_ENCAP_TYPE_GRE6)? "gre6"
: (mp->encap == LB_ENCAP_TYPE_NAT4)? "nat4"
: (mp->encap == LB_ENCAP_TYPE_NAT6)? "nat6"
: "l3dsr");
if (mp->encap==LB_ENCAP_TYPE_L3DSR)
{
s = format (s, "dscp %u ", mp->dscp);
}
if ((mp->encap==LB_ENCAP_TYPE_NAT4)
|| (mp->encap==LB_ENCAP_TYPE_NAT6))
{
s = format (s, "type %u ", mp->type);
s = format (s, "port %u ", mp->port);
s = format (s, "target_port %u ", mp->target_port);
s = format (s, "node_port %u ", mp->node_port);
}
s = format (s, "%u ", mp->new_flows_table_length);
s = format (s, "%s ", mp->is_del?"del":"add");
FINISH;
@ -161,14 +198,23 @@ vl_api_lb_add_del_as_t_handler
vl_api_lb_conf_reply_t * rmp;
int rv = 0;
u32 vip_index;
if ((rv = lb_vip_find_index((ip46_address_t *)mp->vip_ip_prefix,
mp->vip_prefix_length, &vip_index)))
ip46_address_t vip_ip_prefix;
memcpy(&vip_ip_prefix.ip6, mp->vip_ip_prefix,
sizeof(vip_ip_prefix.ip6));
ip46_address_t as_address;
memcpy(&as_address.ip6, mp->as_address,
sizeof(as_address.ip6));
if ((rv = lb_vip_find_index(&vip_ip_prefix, mp->vip_prefix_length, &vip_index)))
goto done;
if (mp->is_del)
rv = lb_vip_del_ass(vip_index, (ip46_address_t *)mp->as_address, 1);
rv = lb_vip_del_ass(vip_index, &as_address, 1);
else
rv = lb_vip_add_ass(vip_index, (ip46_address_t *)mp->as_address, 1);
rv = lb_vip_add_ass(vip_index, &as_address, 1);
done:
REPLY_MACRO (VL_API_LB_CONF_REPLY);

View File

@ -21,20 +21,24 @@ lb_vip_command_fn (vlib_main_t * vm,
unformat_input_t * input, vlib_cli_command_t * cmd)
{
unformat_input_t _line_input, *line_input = &_line_input;
ip46_address_t prefix;
u8 plen;
u32 new_len = 1024;
lb_vip_add_args_t args;
u8 del = 0;
int ret;
u32 encap = 0;
u32 dscp = ~0;
lb_vip_type_t type = 0;
u32 srv_type = LB_SRV_TYPE_CLUSTERIP;
u32 port = 0;
u32 target_port = 0;
u32 node_port = 0;
clib_error_t *error = 0;
args.new_length = 1024;
if (!unformat_user (input, unformat_line_input, line_input))
return 0;
if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY)) {
if (!unformat(line_input, "%U", unformat_ip46_prefix, &(args.prefix),
&(args.plen), IP46_TYPE_ANY, &(args.plen))) {
error = clib_error_return (0, "invalid vip prefix: '%U'",
format_unformat_error, line_input);
goto done;
@ -42,7 +46,7 @@ lb_vip_command_fn (vlib_main_t * vm,
while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
{
if (unformat(line_input, "new_len %d", &new_len))
if (unformat(line_input, "new_len %d", &(args.new_length)))
;
else if (unformat(line_input, "del"))
del = 1;
@ -52,8 +56,22 @@ lb_vip_command_fn (vlib_main_t * vm,
encap = LB_ENCAP_TYPE_GRE6;
else if (unformat(line_input, "encap l3dsr"))
encap = LB_ENCAP_TYPE_L3DSR;
else if (unformat(line_input, "encap nat4"))
encap = LB_ENCAP_TYPE_NAT4;
else if (unformat(line_input, "encap nat6"))
encap = LB_ENCAP_TYPE_NAT6;
else if (unformat(line_input, "dscp %d", &dscp))
;
else if (unformat(line_input, "type clusterip"))
srv_type = LB_SRV_TYPE_CLUSTERIP;
else if (unformat(line_input, "type nodeport"))
srv_type = LB_SRV_TYPE_NODEPORT;
else if (unformat(line_input, "port %d", &port))
;
else if (unformat(line_input, "target_port %d", &target_port))
;
else if (unformat(line_input, "node_port %d", &node_port))
;
else {
error = clib_error_return (0, "parse error: '%U'",
format_unformat_error, line_input);
@ -75,32 +93,61 @@ lb_vip_command_fn (vlib_main_t * vm,
goto done;
}
if (ip46_prefix_is_ip4(&prefix, plen)) {
if (ip46_prefix_is_ip4(&(args.prefix), (args.plen)))
{
if (encap == LB_ENCAP_TYPE_GRE4)
type = LB_VIP_TYPE_IP4_GRE4;
args.type = LB_VIP_TYPE_IP4_GRE4;
else if (encap == LB_ENCAP_TYPE_GRE6)
type = LB_VIP_TYPE_IP4_GRE6;
args.type = LB_VIP_TYPE_IP4_GRE6;
else if (encap == LB_ENCAP_TYPE_L3DSR)
type = LB_VIP_TYPE_IP4_L3DSR;
} else {
args.type = LB_VIP_TYPE_IP4_L3DSR;
else if (encap == LB_ENCAP_TYPE_NAT4)
args.type = LB_VIP_TYPE_IP4_NAT4;
else if (encap == LB_ENCAP_TYPE_NAT6)
{
error = clib_error_return(0, "currently does not support NAT46");
goto done;
}
}
else
{
if (encap == LB_ENCAP_TYPE_GRE4)
type = LB_VIP_TYPE_IP6_GRE4;
args.type = LB_VIP_TYPE_IP6_GRE4;
else if (encap == LB_ENCAP_TYPE_GRE6)
type = LB_VIP_TYPE_IP6_GRE6;
args.type = LB_VIP_TYPE_IP6_GRE6;
else if (encap == LB_ENCAP_TYPE_NAT6)
args.type = LB_VIP_TYPE_IP6_NAT6;
else if (encap == LB_ENCAP_TYPE_NAT4)
{
error = clib_error_return(0, "currently does not support NAT64");
goto done;
}
}
lb_garbage_collection();
u32 index;
if (!del) {
if ((ret = lb_vip_add(&prefix, plen, type, (u8)(dscp & 0x3F), new_len, &index))) {
if (encap == LB_ENCAP_TYPE_L3DSR) {
args.encap_args.dscp = (u8)(dscp & 0x3F);
}
else if ((encap == LB_ENCAP_TYPE_NAT4)
|| (encap == LB_ENCAP_TYPE_NAT6))
{
args.encap_args.srv_type = (u8) srv_type;
args.encap_args.port = (u16) port;
args.encap_args.target_port = (u16) target_port;
args.encap_args.node_port = (u16) node_port;
}
if ((ret = lb_vip_add(args, &index))) {
error = clib_error_return (0, "lb_vip_add error %d", ret);
goto done;
} else {
vlib_cli_output(vm, "lb_vip_add ok %d", index);
}
} else {
if ((ret = lb_vip_find_index(&prefix, plen, &index))) {
if ((ret = lb_vip_find_index(&(args.prefix), args.plen, &index))) {
error = clib_error_return (0, "lb_vip_find_index error %d", ret);
goto done;
} else if ((ret = lb_vip_del(index))) {
@ -118,7 +165,10 @@ done:
VLIB_CLI_COMMAND (lb_vip_command, static) =
{
.path = "lb vip",
.short_help = "lb vip <prefix> [encap (gre6|gre4|l3dsr)] [dscp <n>] [new_len <n>] [del]",
.short_help = "lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] "
"[dscp <n>] "
"[type (nodeport|clusterip) port <n> target_port <n> node_port <n>] "
"[new_len <n>] [del]",
.function = lb_vip_command_fn,
};
@ -300,6 +350,99 @@ VLIB_CLI_COMMAND (lb_show_vips_command, static) =
.function = lb_show_vips_command_fn,
};
static clib_error_t *
lb_set_interface_nat_command_fn (vlib_main_t * vm,
unformat_input_t * input,
vlib_cli_command_t * cmd,
u8 is_nat6)
{
unformat_input_t _line_input, *line_input = &_line_input;
vnet_main_t * vnm = vnet_get_main();
clib_error_t * error = 0;
u32 * sw_if_index = 0;
u32 * inside_sw_if_indices = 0;
int is_del = 0;
/* Get a line of input. */
if (!unformat_user (input, unformat_line_input, line_input))
return 0;
while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
{
if (unformat (line_input, "in %U", unformat_vnet_sw_interface,
vnm, sw_if_index))
vec_add1 (inside_sw_if_indices, *sw_if_index);
else if (unformat (line_input, "del"))
is_del = 1;
else
{
error = clib_error_return (0, "unknown input '%U'",
format_unformat_error, line_input);
goto done;
}
}
vec_foreach (sw_if_index, inside_sw_if_indices)
{
if (!is_nat6)
{
if (lb_nat4_interface_add_del (*sw_if_index, is_del))
{
error = clib_error_return(
0, "%s %U failed", is_del ? "del" : "add",
format_vnet_sw_interface_name, vnm,
vnet_get_sw_interface (vnm, *sw_if_index));
goto done;
}
}
else
{
if (lb_nat6_interface_add_del (*sw_if_index, is_del))
{
error = clib_error_return(
0, "%s %U failed", is_del ? "del" : "add",
format_vnet_sw_interface_name, vnm,
vnet_get_sw_interface (vnm, *sw_if_index));
goto done;
}
}
}
done:
unformat_free (line_input);
vec_free (inside_sw_if_indices);
return error;
}
static clib_error_t *
lb_set_interface_nat4_command_fn (vlib_main_t * vm,
unformat_input_t * input,
vlib_cli_command_t * cmd)
{
return lb_set_interface_nat_command_fn(vm, input, cmd, 0);
}
VLIB_CLI_COMMAND (lb_set_interface_nat4_command, static) = {
.path = "lb set interface nat4",
.function = lb_set_interface_nat4_command_fn,
.short_help = "lb set interface nat4 in <intfc> [del]",
};
static clib_error_t *
lb_set_interface_nat6_command_fn (vlib_main_t * vm,
unformat_input_t * input,
vlib_cli_command_t * cmd)
{
return lb_set_interface_nat_command_fn(vm, input, cmd, 1);
}
VLIB_CLI_COMMAND (lb_set_interface_nat6_command, static) = {
.path = "lb set interface nat6",
.function = lb_set_interface_nat6_command_fn,
.short_help = "lb set interface nat6 in <intfc> [del]",
};
static clib_error_t *
lb_flowtable_flush_command_fn (vlib_main_t * vm,
unformat_input_t * input, vlib_cli_command_t * cmd)

View File

@ -3,9 +3,9 @@ option version = "1.0.0";
/** \brief Configure Load-Balancer global parameters
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
@param ip4_src_address - IPv4 address to be used as source for IPv4 GRE traffic.
@param ip6_src_address - IPv6 address to be used as source for IPv6 GRE traffic.
@param n_sticky_buckets - Number of buckets *per worker thread* in the
@param ip4_src_address - IPv4 address to be used as source for IPv4 traffic(applicable in GRE4/GRE6/NAT4/NAT6 mode only).
@param ip6_src_address - IPv6 address to be used as source for IPv6 traffic(applicable in GRE4/GRE6/NAT4/NAT6 mode only).
@param sticky_buckets_per_core - Number of buckets *per worker thread* in the
established flow table (must be power of 2).
@param flow_timeout - Time in seconds after which, if no packet is received
for a given flow, the flow is removed from the established flow table.
@ -25,8 +25,12 @@ autoreply define lb_conf
@param context - sender context, to match reply w/ request
@param ip_prefix - IP address (IPv4 in lower order 32 bits).
@param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4).
@param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2).
@param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2) or NAT4(3) or NAT6(4).
@param dscp - DSCP bit corresponding to VIP(applicable in L3DSR mode only).
@param type - service type(applicable in NAT4/NAT6 mode only).
@param port - service port(applicable in NAT4/NAT6 mode only).
@param target_port - Pod's port corresponding to specific service(applicable in NAT4/NAT6 mode only).
@param node_port - Node's port(applicable in NAT4/NAT6 mode only).
@param new_flows_table_length - Size of the new connections flow table used
for this VIP (must be power of 2).
@param is_del - The VIP should be removed.
@ -38,6 +42,10 @@ autoreply define lb_add_del_vip {
u8 prefix_length;
u8 encap;
u8 dscp;
u8 type;
u16 port;
u16 target_port;
u16 node_port;
u32 new_flows_table_length;
u8 is_del;
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -29,6 +29,32 @@ Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using
the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR).
Meaning that for a given VIP, all AS addresses must be of the same family.
3). IPv4/IPv6 + NAT4/NAT6 encap types:
This type provides kube-proxy data plane on user space,
which is used to replace linux kernal's kube-proxy based on iptables.
Currently, load balancer plugin supports three service types:
a) Cluster IP plus Port: support any protocols, including TCP, UDP.
b) Node IP plus Node Port: currently only support UDP.
c) External Load Balancer.
For Cluster IP plus Port case:
kube-proxy is configured with a set of Virtual IPs (VIP, which can be
prefixes), and for each VIP, with a set of AS addresses (ASs).
For a specific session received for a given VIP (or VIP prefix),
first packet selects a AS according to internal load balancing algorithm,
then does DNAT operation and sent to chosen AS.
At the same time, will create a session entry to store AS chosen result.
Following packets for that session will look up session table first,
which ensures that a given session will always be routed to the same AS.
For returned packet from AS, it will do SNAT operation and sent out.
Please refer to below for details:
https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf
## Performances
The load balancer has been tested up to 1 millions flows and still forwards more
@ -45,9 +71,11 @@ The load balancer needs to be configured with some parameters:
lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
[buckets <n>] [timeout <s>]
ip4-src-address: the source address used to send encap. packets using IPv4.
ip4-src-address: the source address used to send encap. packets using IPv4 for GRE4 mode.
or Node IP4 address for NAT4 mode.
ip6-src-address: the source address used to send encap. packets using IPv6.
ip6-src-address: the source address used to send encap. packets using IPv6 for GRE6 mode.
or Node IP6 address for NAT6 mode.
buckets: the *per-thread* established-connexions-table number of buckets.
@ -57,13 +85,15 @@ timeout: the number of seconds a connection will remain in the
### Configure the VIPs
lb vip <prefix> [encap (gre6|gre4|l3dsr)] [dscp <n>] [new_len <n>] [del]
lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \
[dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del]
new_len is the size of the new-connection-table. It should be 1 or 2 orders of
magnitude bigger than the number of ASs for the VIP in order to ensure a good
load balancing.
Encap l3dsr and dscp is used to map VIP to dscp bit and rewrite DSCP bit in packets.
So the selected server could get VIP from DSCP bit in this packet and perform DSR.
Encap nat4/nat6 and port/target_port/node_port is used to do kube-proxy data plane.
Examples:
@ -72,6 +102,8 @@ Examples:
lb vip 80.0.0.0/8 encap gre6 new_len 16
lb vip 90.0.0.0/8 encap gre4 new_len 1024
lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32
lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024
lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024
### Configure the ASs (for each VIP)
@ -87,7 +119,17 @@ Examples:
lb as 80.0.0.0/8 2001::2
lb as 90.0.0.0/8 10.0.0.1
### Configure SNAT
lb set interface nat4 in <intfc> [del]
Set SNAT feature in a specific interface.
(applicable in NAT4 mode only)
lb set interface nat6 in <intfc> [del]
Set SNAT feature in a specific interface.
(applicable in NAT6 mode only)
## Monitoring

View File

@ -171,6 +171,10 @@ static int api_lb_add_del_vip (vat_main_t * vam)
mps.encap = LB_ENCAP_TYPE_GRE6;
} else if (unformat(i, "l3dsr")) {
mps.encap = LB_ENCAP_TYPE_L3DSR;
} else if (unformat(i, "nat4")) {
mps.encap = LB_ENCAP_TYPE_NAT4;
} else if (unformat(i, "nat6")) {
mps.encap = LB_ENCAP_TYPE_NAT6;
} else {
errmsg ("no encap\n");
return -99;
@ -221,7 +225,9 @@ static int api_lb_add_del_as (vat_main_t * vam)
*/
#define foreach_vpe_api_msg \
_(lb_conf, "<ip4-src-addr> <ip6-src-address> <sticky_buckets_per_core> <flow_timeout>") \
_(lb_add_del_vip, "<ip-prefix> [gre4|gre6] <new_table_len> [del]") \
_(lb_add_del_vip, "<ip-prefix> [gre4|gre6|l3dsr|nat4|nat6] " \
"<dscp> <port> <target_port> <node_port> " \
"<new_table_len> [del]") \
_(lb_add_del_as, "<vip-ip-prefix> <address> [del]")
static void

File diff suppressed because it is too large Load Diff

View File

@ -1,207 +0,0 @@
import socket
import unittest
from scapy.layers.inet import IP, UDP
from scapy.layers.inet6 import IPv6
from scapy.layers.l2 import Ether
from scapy.packet import Raw
from framework import VppTestCase, running_extended_tests
from util import ppp
""" TestKP is a subclass of VPPTestCase classes.
TestKP class defines Four NAT test case for:
- IP4 to IP4 NAT
- IP4 to IP6 NAT
- IP6 to IP4 NAT
- IP6 to IP6 NAT
"""
class TestKP(VppTestCase):
""" Kube-proxy Test Case """
@classmethod
def setUpClass(cls):
super(TestKP, cls).setUpClass()
cls.pods = range(5)
cls.packets = range(5)
try:
cls.create_pg_interfaces(range(2))
cls.interfaces = list(cls.pg_interfaces)
for i in cls.interfaces:
i.admin_up()
i.config_ip4()
i.config_ip6()
i.disable_ipv6_ra()
i.resolve_arp()
i.resolve_ndp()
dst4 = socket.inet_pton(socket.AF_INET, "10.0.0.0")
dst6 = socket.inet_pton(socket.AF_INET6, "2002::")
cls.vapi.ip_add_del_route(dst4, 24, cls.pg1.remote_ip4n)
cls.vapi.ip_add_del_route(dst6, 16, cls.pg1.remote_ip6n, is_ipv6=1)
except Exception:
super(TestKP, cls).tearDownClass()
raise
def tearDown(self):
super(TestKP, self).tearDown()
if not self.vpp_dead:
self.logger.info(self.vapi.cli("show ku vip verbose"))
def getIPv4Flow(self, id):
return (IP(dst="90.0.%u.%u" % (id / 255, id % 255),
src="40.0.%u.%u" % (id / 255, id % 255)) /
UDP(sport=10000 + id, dport=3306))
def getIPv6Flow(self, id):
return (IPv6(dst="2001::%u" % (id), src="fd00:f00d:ffff::%u" % (id)) /
UDP(sport=10000 + id, dport=3306))
def generatePackets(self, src_if, isv4):
self.reset_packet_infos()
pkts = []
for pktid in self.packets:
info = self.create_packet_info(src_if, self.pg1)
payload = self.info_to_payload(info)
ip = self.getIPv4Flow(pktid) if isv4 else self.getIPv6Flow(pktid)
packet = (Ether(dst=src_if.local_mac, src=src_if.remote_mac) /
ip /
Raw(payload))
self.extend_packet(packet, 128)
info.data = packet.copy()
pkts.append(packet)
return pkts
def checkInner(self, udp):
self.assertEqual(udp.dport, 3307)
def checkCapture(self, nat4, isv4):
self.pg0.assert_nothing_captured()
out = self.pg1.get_capture(len(self.packets))
load = [0] * len(self.pods)
self.info = None
for p in out:
try:
podid = 0
udp = None
if nat4:
ip = p[IP]
podid = int(ip.dst.split(".")[3])
self.assertEqual(ip.version, 4)
self.assertEqual(ip.flags, 0)
self.assertEqual(ip.dst, "10.0.0.%u" % podid)
self.assertEqual(ip.proto, 17)
self.assertEqual(len(ip.options), 0)
self.assertGreaterEqual(ip.ttl, 63)
udp = p[UDP]
else:
ip = p[IPv6]
podid = ip.dst.split(":")
podid = podid[len(podid) - 1]
podid = 0 if podid == "" else int(podid)
self.assertEqual(ip.version, 6)
self.assertEqual(ip.tc, 0)
self.assertEqual(ip.fl, 0)
self.assertEqual(
socket.inet_pton(socket.AF_INET6, ip.dst),
socket.inet_pton(socket.AF_INET6, "2002::%u" % podid)
)
self.assertEqual(ip.nh, 17)
self.assertGreaterEqual(ip.hlim, 63)
udp = UDP(str(p[IPv6].payload))
# self.assertEqual(len(ip.options), 0)
self.checkInner(udp)
load[podid] += 1
except:
self.logger.error(ppp("Unexpected or invalid packet:", p))
raise
# This is just to roughly check that the balancing algorithm
# is not completly biased.
for podid in self.pods:
if load[podid] < len(self.packets) / (len(self.pods) * 2):
self.log(
"Pod isn't balanced: load[%d] = %d" % (podid, load[podid]))
raise Exception("Kube-proxy algorithm is biased")
def test_kp_ip4_nat4(self):
""" Kube-proxy NAT44 """
try:
self.vapi.cli("ku vip 90.0.0.0/8 port 3306 target_port 3307 nat4")
for podid in self.pods:
self.vapi.cli("ku pod 90.0.0.0/8 10.0.0.%u" % (podid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(nat4=True, isv4=True)
finally:
for podid in self.pods:
self.vapi.cli("ku pod 90.0.0.0/8 10.0.0.%u del" % (podid))
self.vapi.cli("ku vip 90.0.0.0/8 nat4 del")
self.vapi.cli("test kube-proxy flowtable flush")
@unittest.skip("this test is broken")
def test_kp_ip6_nat4(self):
""" Kube-proxy NAT64 """
try:
self.vapi.cli("ku vip 90.0.0.0/8 port 3306 target_port 3307 nat4")
for podid in self.pods:
self.vapi.cli("ku pod 2001::/16 10.0.0.%u" % (podid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=False))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(nat4=True, isv4=False)
finally:
for podid in self.pods:
self.vapi.cli("ku pod 2001::/16 10.0.0.%u del" % (podid))
self.vapi.cli("ku vip 2001::/16 nat4 del")
self.vapi.cli("test kube-proxy flowtable flush")
@unittest.skip("this test is broken")
def test_kp_ip4_nat6(self):
""" Kube-proxy NAT46 """
try:
self.vapi.cli("ku vip 90.0.0.0/8 port 3306 target_port 3307 nat6")
for podid in self.pods:
self.vapi.cli("ku pod 90.0.0.0/8 2002::%u" % (podid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(nat4=False, isv4=True)
finally:
for podid in self.pods:
self.vapi.cli("ku pod 90.0.0.0/8 2002::%u del" % (podid))
self.vapi.cli("ku vip 90.0.0.0/8 nat6 del")
self.vapi.cli("test kube-proxy flowtable flush")
@unittest.skipUnless(running_extended_tests(), "part of extended tests")
def test_kp_ip6_nat6(self):
""" Kube-proxy NAT66 """
try:
self.vapi.cli("ku vip 2001::/16 port 3306 target_port 3307 nat6")
for podid in self.pods:
self.vapi.cli("ku pod 2001::/16 2002::%u" % (podid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=False))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(nat4=False, isv4=False)
finally:
for podid in self.pods:
self.vapi.cli("ku pod 2001::/16 2002::%u del" % (podid))
self.vapi.cli("ku vip 2001::/16 nat6 del")
self.vapi.cli("test kube-proxy flowtable flush")

View File

@ -16,6 +16,8 @@ from util import ppp
- IP6 to GRE4 encap
- IP6 to GRE6 encap
- IP4 to L3DSR encap
- IP4 to NAT4 encap
- IP6 to NAT6 encap
As stated in comments below, GRE has issues with IPv6.
All test cases involving IPv6 are executed, but
@ -135,7 +137,7 @@ class TestLB(VppTestCase):
# self.assertEqual(len(ip.options), 0)
gre = GRE(str(p[IPv6].payload))
self.checkInner(gre, isv4)
if (encap == 'l3dsr'):
elif (encap == 'l3dsr'):
ip = p[IP]
asid = int(ip.dst.split(".")[3])
self.assertEqual(ip.version, 4)
@ -143,6 +145,33 @@ class TestLB(VppTestCase):
self.assertEqual(ip.dst, "10.0.0.%u" % asid)
self.assertEqual(ip.tos, 0x1c)
self.assertEqual(len(ip.options), 0)
elif (encap == 'nat4'):
ip = p[IP]
asid = int(ip.dst.split(".")[3])
self.assertEqual(ip.version, 4)
self.assertEqual(ip.flags, 0)
self.assertEqual(ip.dst, "10.0.0.%u" % asid)
self.assertEqual(ip.proto, 17)
self.assertEqual(len(ip.options), 0)
self.assertGreaterEqual(ip.ttl, 63)
udp = p[UDP]
self.assertEqual(udp.dport, 3307)
elif (encap == 'nat6'):
ip = p[IPv6]
asid = ip.dst.split(":")
asid = asid[len(asid) - 1]
asid = 0 if asid == "" else int(asid)
self.assertEqual(ip.version, 6)
self.assertEqual(ip.tc, 0)
self.assertEqual(ip.fl, 0)
self.assertEqual(
socket.inet_pton(socket.AF_INET6, ip.dst),
socket.inet_pton(socket.AF_INET6, "2002::%u" % asid)
)
self.assertEqual(ip.nh, 17)
self.assertGreaterEqual(ip.hlim, 63)
udp = UDP(str(p[IPv6].payload))
self.assertEqual(udp.dport, 3307)
load[asid] += 1
except:
self.logger.error(ppp("Unexpected or invalid packet:", p))
@ -246,3 +275,43 @@ class TestLB(VppTestCase):
self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u del" % (asid))
self.vapi.cli("lb vip 90.0.0.0/8 encap l3dsr dscp 7 del")
self.vapi.cli("test lb flowtable flush")
def test_lb_ip4_nat4(self):
""" Load Balancer IP4 NAT4 """
try:
self.vapi.cli("lb vip 90.0.0.0/8 encap nat4"
" type clusterip port 3306 target_port 3307")
for asid in self.ass:
self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u" % (asid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=True))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(encap='nat4', isv4=True)
finally:
for asid in self.ass:
self.vapi.cli("lb as 90.0.0.0/8 10.0.0.%u del" % (asid))
self.vapi.cli("lb vip 90.0.0.0/8 encap nat4"
" type clusterip port 3306 target_port 3307 del")
self.vapi.cli("test lb flowtable flush")
def test_lb_ip6_nat6(self):
""" Load Balancer IP6 NAT6 """
try:
self.vapi.cli("lb vip 2001::/16 encap nat6"
" type clusterip port 3306 target_port 3307")
for asid in self.ass:
self.vapi.cli("lb as 2001::/16 2002::%u" % (asid))
self.pg0.add_stream(self.generatePackets(self.pg0, isv4=False))
self.pg_enable_capture(self.pg_interfaces)
self.pg_start()
self.checkCapture(encap='nat6', isv4=False)
finally:
for asid in self.ass:
self.vapi.cli("lb as 2001::/16 2002::%u del" % (asid))
self.vapi.cli("lb vip 2001::/16 encap nat6"
" type clusterip port 3306 target_port 3307 del")
self.vapi.cli("test lb flowtable flush")