环境: centos7.6, openvswitch 2.11.0

一、创建模拟接口
  • k8s01 节点执行

安装 openvswitch,需要安装 openstack yum 源

yum install -y openvswitch
systemctl start openvswitch

k8s01 创建、配置 vmA1 接口

[root@k8s01 ~]# ip link add dev vmA1-sw type veth peer name vmA1
[root@k8s01 ~]# ip link set vmA1-sw up
[root@k8s01 ~]# ip link set vmA1 up
[root@k8s01 ~]# ip addr add 192.168.60.11/24 dev vmA1

k8s01 创建、配置 vmB1 接口

[root@k8s01 ~]# ip link add dev vmB1-sw type veth peer name vmB1
[root@k8s01 ~]# ip link set vmB1-sw up
[root@k8s01 ~]# ip link set vmB1 up
[root@k8s01 ~]# ip addr add 192.168.70.11/24 dev vmB1

创建 tenantA 和 tenantB 网桥,它们代表虚拟交换机

[root@k8s01 31956]# ovs-vsctl add-br tenantA
[root@k8s01 31956]# ovs-vsctl add-br tenantB

将模拟接口插到网桥上

[root@k8s01 ~]# ovs-vsctl add-port tenantA vmA1-sw
[root@k8s01 ~]# ovs-vsctl add-port tenantB vmB1-sw
[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
    Bridge tenantA
        Port tenantA
            Interface tenantA
                type: internal
        Port "vmA1-sw"
            Interface "vmA1-sw"
    Bridge tenantB
        Port tenantB
            Interface tenantB
                type: internal
        Port "vmB1-sw"
            Interface "vmB1-sw"
    ovs_version: "2.11.0"
  • k8s02 节点执行

安装 openvswitch,需要安装 openstack yum 源

yum install -y openvswitch
systemctl start openvswitch

k8s02 创建、配置 vmA2 接口

[root@k8s02 ~]# ip link add dev vmA2-sw type veth peer name vmA2
[root@k8s02 ~]# ip link set vmA2-sw up
[root@k8s02 ~]# ip link set vmA2 up
[root@k8s02 ~]# ip addr add 192.168.60.12/24 dev vmA2

k8s02 创建、配置 vmB2 接口

[root@k8s02 ~]# ip link add dev vmB2-sw type veth peer name vmB2
[root@k8s02 ~]# ip link set vmB2-sw up
[root@k8s02 ~]# ip link set vmB2 up
[root@k8s02 ~]# ip addr add 192.168.70.12/24 dev vmB2

创建 tenantA 和 tenantB 网桥,它们代表虚拟交换机

[root@k8s02 31956]# ovs-vsctl add-br tenantA
[root@k8s02 31956]# ovs-vsctl add-br tenantB

将模拟接口插到网桥上

[root@k8s02 ~]# ovs-vsctl add-port tenantA vmA2-sw
[root@k8s02 ~]# ovs-vsctl add-port tenantB vmB2-sw
[root@k8s02 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
    Bridge tenantA
        Port tenantA
            Interface tenantA
                type: internal
        Port "vmA2-sw"
            Interface "vmA2-sw"
    Bridge tenantB
        Port tenantB
            Interface tenantB
                type: internal
        Port "vmB2-sw"
            Interface "vmB2-sw"
    ovs_version: "2.11.0"
  • 在 k8s02 上 ping k8s01 模拟接口,不通
[root@k8s02 ~]# ping -c 2 192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
From 192.168.60.12 icmp_seq=1 Destination Host Unreachable
From 192.168.60.12 icmp_seq=2 Destination Host Unreachable

--- 192.168.60.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2
[root@k8s02 ~]# ping -c 2 192.168.70.11
PING 192.168.70.11 (192.168.70.11) 56(84) bytes of data.
From 192.168.70.12 icmp_seq=1 Destination Host Unreachable
From 192.168.70.12 icmp_seq=2 Destination Host Unreachable

--- 192.168.70.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2

注意事项:

1、使用 ovs-vsctl 报错

[root@k8s02 ~]# ovs-vsctl add-br tenantA
net_mlx5: cannot load glue library: /lib64/libmlx5.so.1: version `MLX5_1.6' not found (required by /usr/lib64/dpdk-pmds-glue/librte_pmd_mlx5_glue.so.18.11.0)
net_mlx5: cannot initialize PMD due to missing run-time dependency on rdma-core libraries (libibverbs, libmlx5)

解决方法

yum install libmlx5
二、创造 vm 之间的 vxlan 管道
  • k8s01 节点执行
[root@k8s01 ~]# ovs-vsctl add-port tenantA vxlanA -- set interface vxlanA type=vxlan options:remote_ip=10.2.7.201 options:key=5000
[root@k8s01 ~]# ovs-vsctl add-port tenantB vxlanB -- set interface vxlanB type=vxlan options:remote_ip=10.2.7.201 options:key=6000
[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
    Bridge tenantA
        Port tenantA
            Interface tenantA
                type: internal
        Port "vmA1-sw"
            Interface "vmA1-sw"
        Port vxlanA
            Interface vxlanA
                type: vxlan
                options: {key="5000", remote_ip="10.2.7.201"}
    Bridge tenantB
        Port tenantB
            Interface tenantB
                type: internal
        Port vxlanB
            Interface vxlanB
                type: vxlan
                options: {key="6000", remote_ip="10.2.7.201"}
        Port "vmB1-sw"
            Interface "vmB1-sw"
    ovs_version: "2.11.0"
  • k8s02 节点执行
[root@k8s02 ~]# ovs-vsctl add-port tenantA vxlanA -- set interface vxlanA type=vxlan options:remote_ip=10.2.7.200 options:key=5000
[root@k8s02 ~]# ovs-vsctl add-port tenantB vxlanB -- set interface vxlanB type=vxlan options:remote_ip=10.2.7.200 options:key=6000
[root@k8s02 ~]#  ovs-vsctl show
c3b22dc3-7884-41cc-98c8-da60451491c3
    Bridge tenantA
        Port vxlanA
            Interface vxlanA
                type: vxlan
                options: {key="5000", remote_ip="10.2.7.200"}
        Port tenantA
            Interface tenantA
                type: internal
        Port "vmA2-sw"
            Interface "vmA2-sw"
    Bridge tenantB
        Port vxlanB
            Interface vxlanB
                type: vxlan
                options: {key="6000", remote_ip="10.2.7.200"}
        Port "vmB2-sw"
            Interface "vmB2-sw"
        Port tenantB
            Interface tenantB
                type: internal
    ovs_version: "2.11.0"
  • 在 k8s02 上 ping k8s01 模拟接口,能通
[root@k8s02 ~]# ping -c 2 192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
64 bytes from 192.168.60.11: icmp_seq=1 ttl=64 time=1.81 ms
64 bytes from 192.168.60.11: icmp_seq=2 ttl=64 time=0.279 ms

--- 192.168.60.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.279/1.048/1.818/0.770 ms
[root@k8s02 ~]# ping -c 2 192.168.70.11
PING 192.168.70.11 (192.168.70.11) 56(84) bytes of data.
64 bytes from 192.168.70.11: icmp_seq=1 ttl=64 time=1.21 ms
64 bytes from 192.168.70.11: icmp_seq=2 ttl=64 time=0.204 ms

--- 192.168.70.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1000ms
rtt min/avg/max/mdev = 0.204/0.710/1.217/0.507 ms
三、openvswitch 控制命令

ovs-vsctl 是配置 openvswitch 最重要的命令,ovs-vsctl 与 ovsdb-server 通信,ovsdb-server 管理着 openvswitch 的配置信息数据库。

ovs-vsctl 一次可以执行一条或多条命令,多条命令用 “--” 分开

帮助信息:

ovs-vsctl --help
man 5 ovs-vswitchd.conf.db
ovs-vsctl list

显示已存在网桥的 db 信息

[root@k8s01 ~]#  ovs-vsctl list bridge
_uuid               : ed1cf240-72b2-4d0d-80ba-34fe4319805f
auto_attach         : []
controller          : []
datapath_id         : "000042f21ced0d4d"
datapath_type       : ""
datapath_version    : "<unknown>"
external_ids        : {}
fail_mode           : []
flood_vlans         : []
flow_tables         : {}
ipfix               : []
mcast_snooping_enable: false
mirrors             : []
name                : tenantB
netflow             : []
other_config        : {}
ports               : [05e579af-f158-4a27-b046-3d45a63f463f, a33d043c-2a82-4f52-8c10-4a28eab38345, b17ade3a-31bf-4cd2-8f1c-8819b82c53aa]
protocols           : []
rstp_enable         : false
rstp_status         : {}
sflow               : []
status              : {}
stp_enable          : false

_uuid               : af7cbcf0-95c4-4f3e-a89a-6eb5a2a44154
auto_attach         : []
controller          : []
datapath_id         : "0000f2bc7caf3e4f"
datapath_type       : ""
datapath_version    : "<unknown>"
external_ids        : {}
fail_mode           : []
flood_vlans         : []
flow_tables         : {}
ipfix               : []
mcast_snooping_enable: false
mirrors             : []
name                : tenantA
netflow             : []
other_config        : {}
ports               : [a631ac6c-6039-4171-812a-aefc24487ac0, f26cfa5d-fd35-439a-a934-86bdfe632d5e, f87dd940-1d71-437a-8694-a003c25c7c7b]
protocols           : []
rstp_enable         : false
rstp_status         : {}
sflow               : []
status              : {}
stp_enable          : false

显示网桥 tenantA 的 db 信息

[root@k8s01 ~]# ovs-vsctl list bridge tenantA
_uuid               : af7cbcf0-95c4-4f3e-a89a-6eb5a2a44154
auto_attach         : []
controller          : []
datapath_id         : "0000f2bc7caf3e4f"
datapath_type       : ""
datapath_version    : "<unknown>"
external_ids        : {}
fail_mode           : []
flood_vlans         : []
flow_tables         : {}
ipfix               : []
mcast_snooping_enable: false
mirrors             : []
name                : tenantA
netflow             : []
other_config        : {}
ports               : [a631ac6c-6039-4171-812a-aefc24487ac0, f26cfa5d-fd35-439a-a934-86bdfe632d5e, f87dd940-1d71-437a-8694-a003c25c7c7b]
protocols           : []
rstp_enable         : false
rstp_status         : {}
sflow               : []
status              : {}
stp_enable          : false
四、openvswitch port

传统的交换机端口是 2 层端口,流量在该交换机端口之间传输,这些端口没有 3 层的 ip 配置。即使是 linux bridge 也具有这些特性,比如 eth0 配置了 ip,再把它加入到一个 bridge,那么你将失去 eth0 的连接,因为 eth0 此时只作为 2 层端口,可以将 ip 移到连接 eth0 的 bridge 接口(比如 br0)

internal port

openvswitch 用 internal port 提供一个解决方法。 internal port 是一个 3 层端口,能够暴露到 openvswitch 外面,所以能够进行 ip 设置

k8s01 上配置

[root@k8s01 ~]#  ovs-vsctl add-port tenantA internalPort -- set interface internalPort type=internal
[root@k8s01 ~]# ip addr add 192.168.60.50/24 dev internalPort
[root@k8s01 ~]# ip link set internalPort up

k8s02 上 ping 192.168.60.50

[root@k8s02 ~]#  ping -c2 192.168.60.50
PING 192.168.60.50 (192.168.60.50) 56(84) bytes of data.
64 bytes from 192.168.60.50: icmp_seq=1 ttl=64 time=1.36 ms
64 bytes from 192.168.60.50: icmp_seq=2 ttl=64 time=0.162 ms

--- 192.168.60.50 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.162/0.763/1.364/0.601 ms

internal port 的名称就像网桥名称一样

[root@k8s01 ~]# ovs-vsctl show
2d3e5812-033a-4641-9064-1268e693c49a
    Bridge tenantA
        Port tenantA
            Interface tenantA
                type: internal
        Port internalPort
            Interface internalPort
                type: internal
        Port "vmA1-sw"
            Interface "vmA1-sw"
        Port vxlanA
            Interface vxlanA
                type: vxlan
                options: {key="5000", remote_ip="10.2.7.201"}
    Bridge tenantB
        Port tenantB
            Interface tenantB
                type: internal
        Port vxlanB
            Interface vxlanB
                type: vxlan
                options: {key="6000", remote_ip="10.2.7.201"}
        Port "vmB1-sw"
            Interface "vmB1-sw"
    ovs_version: "2.11.0"
[root@k8s01 ~]# ip link | grep tenant
197: tenantA: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
198: tenantB: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000

mirror port

交换机有 mirror port 和 span port,该 port 可以作为特定流量的镜像,用于排错。 openvswitch 也支持 mirror port

创建一个内部的 mirror port,我们需要一个端口并在上面进行 wireshark/tcpdump 监听

[root@k8s01 ~]# ovs-vsctl add-port tenantA mirrorPort -- set interface mirrorPort type=internal

创建 mirror 配置

[root@k8s01 ~]# ovs-vsctl --id=@vmA1-sw get port vmA1-sw --\
>                           --id=@mirrorPort get port mirrorPort --\
>                           --id=@mirror create mirror name=mirror \
>                           select-dst-port=@vmA1-sw select-src-port=@vmA1-sw output-port=@mirrorPort --\
>                           set bridge tenantA mirrors=@mirror
8d73fe4f-b111-4f8f-a3d1-b8672dbe037a

安装 wireshark

[root@k8s01 ~]# yum -y install wireshark

k8s02 ping

[root@k8s02 ~]#  ping  192.168.60.11
PING 192.168.60.11 (192.168.60.11) 56(84) bytes of data.
64 bytes from 192.168.60.11: icmp_seq=1 ttl=64 time=0.966 ms
64 bytes from 192.168.60.11: icmp_seq=2 ttl=64 time=0.211 ms
64 bytes from 192.168.60.11: icmp_seq=3 ttl=64 time=0.193 ms
64 bytes from 192.168.60.11: icmp_seq=4 ttl=64 time=0.235 ms
64 bytes from 192.168.60.11: icmp_seq=5 ttl=64 time=0.277 ms
...

监听 mirror 端口

[root@k8s01 ~]# ip link set mirrorPort up
[root@k8s01 ~]# tshark -c 6 -i mirrorPort
Running as user "root" and group "root". This could be dangerous.
Capturing on 'mirrorPort'
  1 0.000000000 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request  id=0x7aec, seq=1/256, ttl=64
  2 0.000140651 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply    id=0x7aec, seq=1/256, ttl=64 (request in 1)
  3 0.999594777 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request  id=0x7aec, seq=2/512, ttl=64
  4 0.999626499 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply    id=0x7aec, seq=2/512, ttl=64 (request in 3)
  5 1.999676670 192.168.60.12 -> 192.168.60.11 ICMP 98 Echo (ping) request  id=0x7aec, seq=3/768, ttl=64
  6 1.999707851 192.168.60.11 -> 192.168.60.12 ICMP 98 Echo (ping) reply    id=0x7aec, seq=3/768, ttl=64 (request in 5)
6 packets captured

监听 eth0 端口,发现 192.168.60.12 > 192.168.60.11 流量是封装成 vxlan 通过 eth0 发送

[root@k8s01 ~]# tcpdump -i eth0 host 10.2.7.201 -n | grep -C2 192.168.60.11
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
09:09:44.905533 IP 10.2.7.200.51404 > 10.2.7.201.2380: Flags [.], ack 1949, win 1424, options [nop,nop,TS val 2565202554 ecr 1863325088], length 0
09:09:44.906178 IP 10.2.7.201.53192 > 10.2.7.200.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.60.12 > 192.168.60.11: ICMP echo request, id 31344, seq 9, length 64
09:09:44.906281 IP 10.2.7.200.39328 > 10.2.7.201.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.60.11 > 192.168.60.12: ICMP echo reply, id 31344, seq 9, length 64
09:09:44.906463 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 3157118280:3157118380, ack 17960795, win 386, options [nop,nop,TS val 1863325089 ecr 4229146211], length 100
09:09:44.906569 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 100, win 1424, options [nop,nop,TS val 4229147211 ecr 1863325089], length 0
--
...

清理 mirror

[root@k8s01 ~]# ovs-vsctl clear bridge tenantA mirrors
[root@k8s01 ~]# ovs-vsctl del-port mirrorPort

注意:mirror 设置中 --id 命令提供接口的别名,需要接口的 id 而不是名称。不能给 patch ports 创建 mirror,如果需要 mirror patch ports 流量,请使用 veth pairs。

patch port

patch ports 就像连接两个交换机之间的电缆,或者插入 openvswitch 网桥的 veth pair

创建 tenantC 网桥以及设置 tenantC internal port

k8s01 节点

[root@k8s01 ~]# ovs-vsctl add-br tenantC
[root@k8s01 ~]# ip addr add 192.168.80.11/24 dev tenantC
[root@k8s01 ~]# ip link set tenantC up

k8s02 节点

[root@k8s02 ~]# ovs-vsctl add-br tenantC
[root@k8s02 ~]# ip addr add 192.168.80.12/24 dev tenantC
[root@k8s02 ~]# ip link set tenantC up

k8s02 ping 192.168.80.11,没有 patch 连接,所以不通

[root@k8s02 ~]#  ping -c2 192.168.80.11
PING 192.168.80.11 (192.168.80.11) 56(84) bytes of data.
From 192.168.80.12 icmp_seq=1 Destination Host Unreachable
From 192.168.80.12 icmp_seq=2 Destination Host Unreachable

--- 192.168.80.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2

建立 patch 连接

k8s01 节点

[root@k8s01 ~]# ovs-vsctl add-port tenantC patchC --\
>                           add-port tenantA patchA --\
>                           set interface patchC type=patch options:peer=patchA --\
>                           set interface patchA type=patch options:peer=patchC

k8s02 节点

[root@k8s02 ~]#  ovs-vsctl add-port tenantC patchC --\
>                           add-port tenantA patchA --\
>                           set interface patchC type=patch options:peer=patchA --\
>                           set interface patchA type=patch options:peer=patchC

k8s02 ping 192.168.80.11

[root@k8s02 ~]# ping -c2 192.168.80.11
PING 192.168.80.11 (192.168.80.11) 56(84) bytes of data.
64 bytes from 192.168.80.11: icmp_seq=1 ttl=64 time=1.47 ms
64 bytes from 192.168.80.11: icmp_seq=2 ttl=64 time=0.189 ms

--- 192.168.80.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.189/0.833/1.478/0.645 ms
vlans

k8s01 节点

[root@k8s01 ~]# ovs-vsctl add-port tenantC vlanPortC tag=10 --\
>                 set interface vlanPortC type=internal
[root@k8s01 ~]# ip addr add 192.168.90.11/24 dev vlanPortC
[root@k8s01 ~]# ip link set vlanPortC up

k8s02 节点

[root@k8s02 ~]#  ovs-vsctl add-port tenantC vlanPortC tag=10 --\
>                 set interface vlanPortC type=internal
[root@k8s02 ~]# ip addr add 192.168.90.12/24 dev vlanPortC
[root@k8s02 ~]#  ip link set vlanPortC up

k8s02 ping 192.168.90.11

[root@k8s02 ~]# ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
64 bytes from 192.168.90.11: icmp_seq=1 ttl=64 time=1.67 ms
64 bytes from 192.168.90.11: icmp_seq=2 ttl=64 time=0.178 ms

--- 192.168.90.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.178/0.927/1.676/0.749 ms

监听 eth0 端口,发现 192.168.90.12 > 192.168.90.11 流量是封装成 vxlan 通过 eth0 发送

[root@k8s01 ~]# tcpdump -i eth0 -n| grep -C2 192.168.90.12
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
10:28:34.066375 IP 10.2.7.202.46040 > 10.2.7.200.2380: Flags [.], ack 886, win 1424, options [nop,nop,TS val 1172533648 ecr 2569931715], length 0
10:28:34.068319 IP 10.2.7.201.42349 > 10.2.7.200.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.90.12 > 192.168.90.11: ICMP echo request, id 19832, seq 134, length 64
10:28:34.068389 IP 10.2.7.200.49438 > 10.2.7.201.4789: VXLAN, flags [I] (0x08), vni 5000
IP 192.168.90.11 > 192.168.90.12: ICMP echo reply, id 19832, seq 134, length 64
10:28:34.068592 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 3157289604:3157289704, ack 17976739, win 431, options [nop,nop,TS val 1868054235 ecr 4233875337], length 100
..
vxlan 和 gre ports

vxlan 和 gre 很相似。移除 tenantA 和 tenantC 之间的 patch,gre 管道将处理 tenantC 网桥之间的流量信息

删除 patch ports

k8s01 节点

[root@k8s01 ~]# ovs-vsctl del-port patchC
[root@k8s01 ~]# ovs-vsctl del-port patchA

k8s02 节点

[root@k8s02 ~]# ovs-vsctl del-port patchC
[root@k8s02 ~]# ovs-vsctl del-port patchA

k8s02 ping 192.168.90.11,不通

[root@k8s02 ~]#  ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
From 192.168.90.12 icmp_seq=1 Destination Host Unreachable
From 192.168.90.12 icmp_seq=2 Destination Host Unreachable

--- 192.168.90.11 ping statistics ---
2 packets transmitted, 0 received, +2 errors, 100% packet loss, time 999ms
pipe 2

增加 gre 管道

k8s01 节点

[root@k8s01 ~]# ovs-vsctl add-port tenantC greC -- set interface greC type=gre options:remote_ip=10.2.7.201

k8s02 节点

[root@k8s02 ~]# ovs-vsctl add-port tenantC greC -- set interface greC type=gre options:remote_ip=10.2.7.200

k8s02 ping 192.168.90.11

[root@k8s02 ~]# ping -c2 192.168.90.11
PING 192.168.90.11 (192.168.90.11) 56(84) bytes of data.
64 bytes from 192.168.90.11: icmp_seq=1 ttl=64 time=1.28 ms
64 bytes from 192.168.90.11: icmp_seq=2 ttl=64 time=0.173 ms

--- 192.168.90.11 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.173/0.729/1.286/0.557 ms

监听 eth0 端口,发现 192.168.90.12 > 192.168.90.11 流量是封装成 GREv0 通过 eth0 发送

[root@k8s01 ~]# tcpdump  -n -i eth0 | grep -C2 192.168.90.12
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
10:50:38.842380 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 1988:2080, ack 393, win 431, options [nop,nop,TS val 1869379008 ecr 4235201072], length 92
10:50:38.842472 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 2080, win 1424, options [nop,nop,TS val 4235201101 ecr 1869379008], length 0
10:50:38.842498 IP 10.2.7.201 > 10.2.7.200: GREv0, length 106: IP 192.168.90.12 > 192.168.90.11: ICMP echo request, id 3857, seq 1, length 64
10:50:38.842607 IP 10.2.7.107.ssh > 10.3.57.11.34344: Flags [P.], seq 2640:2752, ack 641, win 399, options [nop,nop,TS val 4235201101 ecr 3872193733], length 112
10:50:38.842731 IP 10.3.57.11.34344 > 10.2.7.107.ssh: Flags [.], ack 2752, win 2440, options [nop,nop,TS val 3872193762 ecr 4235201101], length 0
10:50:38.842902 IP 10.2.7.200 > 10.2.7.201: GREv0, length 106: IP 192.168.90.11 > 192.168.90.12: ICMP echo reply, id 3857, seq 1, length 64
10:50:38.843231 IP 10.2.7.201.ssh > 10.2.7.107.55590: Flags [P.], seq 2080:2180, ack 393, win 431, options [nop,nop,TS val 1869379009 ecr 4235201101], length 100
10:50:38.843267 IP 10.2.7.107.55590 > 10.2.7.201.ssh: Flags [.], ack 2180, win 1424, options [nop,nop,TS val 4235201101 ecr 1869379009], length 0
--
...

监听 gre_sys 端口,发现 192.168.90.12 > 192.168.90.11 会通过 gre_sys 接口

[root@k8s01 ~]# tcpdump  -n -i gre_sys 
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on gre_sys, link-type EN10MB (Ethernet), capture size 262144 bytes
10:49:28.236776 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 36, length 64
10:49:28.236827 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 36, length 64
10:49:29.236759 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 37, length 64
10:49:29.236809 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 37, length 64
10:49:30.236728 IP 192.168.80.12 > 192.168.80.11: ICMP echo request, id 2707, seq 38, length 64
10:49:30.236773 IP 192.168.80.11 > 192.168.80.12: ICMP echo reply, id 2707, seq 38, length 64
...
五、mtu 问题

mtu 最大传输单元,一般最大值为 1500,超过就会切分。vxlan 使用封装技术,传输单元可能会超过底层网络最大传输单元 1500,这会导致问题,比如下面。

k8s02 作为server

[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 40230
[  5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 40232
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-1.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   1.00-2.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   5.00-6.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   6.00-7.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   7.00-8.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec                  
[  5]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec                  
[  5]  10.00-10.04  sec  0.00 Bytes  0.00 bits/sec                  
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-10.04  sec  0.00 Bytes  0.00 bits/sec                  sender
[  5]   0.00-10.04  sec  0.00 Bytes  0.00 bits/sec                  receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
...

k8s01 作为 client

[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[  4] local 192.168.70.11 port 40232 connected to 192.168.70.12 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec  84.8 KBytes   694 Kbits/sec    2   1.41 KBytes       
[  4]   1.00-2.00   sec  0.00 Bytes  0.00 bits/sec    1   1.41 KBytes       
[  4]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
[  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec    1   1.41 KBytes       
[  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
[  4]   5.00-6.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
[  4]   6.00-7.00   sec  0.00 Bytes  0.00 bits/sec    1   1.41 KBytes       
[  4]   7.00-8.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
[  4]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
[  4]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-10.00  sec  84.8 KBytes  69.5 Kbits/sec    5             sender
[  4]   0.00-10.00  sec  0.00 Bytes  0.00 bits/sec                  receiver

iperf Done.

  • 解决方法 1

为了避免这样的问题,必须调整底层网络的 mtu。通过计算额外的 header,VXLAN + UDP + IP + Ethernet,得到 mtu 1554

[root@k8s02 ~]# ip link set eth0 mtu 1554
[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 45934
[  5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 45936
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-1.00   sec   316 MBytes  2.65 Gbits/sec                  
[  5]   1.00-2.00   sec   329 MBytes  2.76 Gbits/sec                  
[  5]   2.00-3.00   sec   356 MBytes  2.99 Gbits/sec                  
[  5]   3.00-4.00   sec   353 MBytes  2.96 Gbits/sec                  
[  5]   4.00-5.00   sec   353 MBytes  2.96 Gbits/sec                  
[  5]   5.00-6.00   sec   344 MBytes  2.88 Gbits/sec                  
[  5]   6.00-7.00   sec   348 MBytes  2.92 Gbits/sec                  
[  5]   7.00-8.00   sec   348 MBytes  2.92 Gbits/sec                  
[  5]   8.00-9.00   sec   342 MBytes  2.87 Gbits/sec                  
[  5]   9.00-10.00  sec   344 MBytes  2.89 Gbits/sec                  
[  5]  10.00-10.04  sec  15.1 MBytes  2.99 Gbits/sec                  
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-10.04  sec  0.00 Bytes  0.00 bits/sec                  sender
[  5]   0.00-10.04  sec  3.37 GBytes  2.88 Gbits/sec                  receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
[root@k8s01 ~]# ip link set eth0 mtu 1554
[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[  4] local 192.168.70.11 port 45936 connected to 192.168.70.12 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   332 MBytes  2.79 Gbits/sec    8    730 KBytes       
[  4]   1.00-2.00   sec   330 MBytes  2.77 Gbits/sec    0   1022 KBytes       
[  4]   2.00-3.00   sec   356 MBytes  2.99 Gbits/sec    0   1.23 MBytes       
[  4]   3.00-4.00   sec   352 MBytes  2.96 Gbits/sec   32   1.42 MBytes       
[  4]   4.00-5.00   sec   352 MBytes  2.96 Gbits/sec    0   1.59 MBytes       
[  4]   5.00-6.00   sec   345 MBytes  2.89 Gbits/sec   14   1.30 MBytes       
[  4]   6.00-7.00   sec   348 MBytes  2.92 Gbits/sec    0   1.49 MBytes       
[  4]   7.00-8.00   sec   349 MBytes  2.93 Gbits/sec    9   1.19 MBytes       
[  4]   8.00-9.00   sec   341 MBytes  2.86 Gbits/sec    0   1.39 MBytes       
[  4]   9.00-10.00  sec   345 MBytes  2.90 Gbits/sec   32   1.55 MBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-10.00  sec  3.37 GBytes  2.89 Gbits/sec   95             sender
[  4]   0.00-10.00  sec  3.37 GBytes  2.89 Gbits/sec                  receiver

iperf Done.
  • 解决方法 2

也可以减少 vmB1 的 mtu

[root@k8s02 ~]# ip link set vmB2 mtu 1450
[root@k8s02 ~]# ip link set vmB2-sw mtu 1450
[root@k8s02 ~]# iperf3 -s
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.70.11, port 52206
[  5] local 192.168.70.12 port 5201 connected to 192.168.70.11 port 52208
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-1.00   sec   298 MBytes  2.50 Gbits/sec                  
[  5]   1.00-2.00   sec   326 MBytes  2.73 Gbits/sec                  
[  5]   2.00-3.00   sec   327 MBytes  2.75 Gbits/sec                  
[  5]   3.00-4.00   sec   323 MBytes  2.71 Gbits/sec                  
[  5]   4.00-5.00   sec   334 MBytes  2.80 Gbits/sec                  
[  5]   5.00-6.00   sec   341 MBytes  2.86 Gbits/sec                  
[  5]   6.00-7.00   sec   342 MBytes  2.87 Gbits/sec                  
[  5]   7.00-8.00   sec   344 MBytes  2.89 Gbits/sec                  
[  5]   8.00-9.00   sec   345 MBytes  2.89 Gbits/sec                  
[  5]   9.00-10.00  sec   341 MBytes  2.86 Gbits/sec                  
[  5]  10.00-10.04  sec  13.9 MBytes  2.86 Gbits/sec                  
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth
[  5]   0.00-10.04  sec  0.00 Bytes  0.00 bits/sec                  sender
[  5]   0.00-10.04  sec  3.26 GBytes  2.79 Gbits/sec                  receiver
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
[root@k8s01 ~]# ip link set vmA1 mtu 1450
[root@k8s01 ~]# ip link set vmA1-sw mtu 1450
[root@k8s01 ~]# iperf3 -c 192.168.70.12
Connecting to host 192.168.70.12, port 5201
[  4] local 192.168.70.11 port 52208 connected to 192.168.70.12 port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   312 MBytes  2.61 Gbits/sec   47    754 KBytes       
[  4]   1.00-2.00   sec   328 MBytes  2.75 Gbits/sec    0   1.00 MBytes       
[  4]   2.00-3.00   sec   327 MBytes  2.75 Gbits/sec    0   1.21 MBytes       
[  4]   3.00-4.00   sec   322 MBytes  2.71 Gbits/sec    0   1.39 MBytes       
[  4]   4.00-5.00   sec   334 MBytes  2.80 Gbits/sec    0   1.55 MBytes       
[  4]   5.00-6.00   sec   341 MBytes  2.86 Gbits/sec   23   1.28 MBytes       
[  4]   6.00-7.00   sec   342 MBytes  2.87 Gbits/sec   33   1.45 MBytes       
[  4]   7.00-8.00   sec   345 MBytes  2.89 Gbits/sec    1   1.17 MBytes       
[  4]   8.00-9.00   sec   344 MBytes  2.88 Gbits/sec    0   1.37 MBytes       
[  4]   9.00-10.00  sec   341 MBytes  2.86 Gbits/sec    0   1.53 MBytes       
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-10.00  sec  3.26 GBytes  2.80 Gbits/sec  104             sender
[  4]   0.00-10.00  sec  3.26 GBytes  2.80 Gbits/sec                  receiver

iperf Done.

参考文章:

OpenStack Networking: Open vSwitch and VXLAN introduction

Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐