kubeadm部署3master3node crio(1.24.0)+k8s(1.26.0)

环境说明:
   部署平台:openstack
   网络:可翻墙,网速不好
   系统:	centos7
   主机:
   		192.168.20.127 k8s-master-1		8C16G 
   		192.168.20.32  k8s-master-2		8C16G
   		192.168.20.121 k8s-master-3		8C16G
   		192.168.20.18  k8s-node-1		16C32G
   		192.168.20.93  k8s-node-2		16C32G
   		192.168.20.78  k8s-node-3		16C32G
部署:
1.系统基础配置
#以下操作所有机器执行
0.ip地址最好为静态
1.配置/etc/hosts
2.配置ssh免密登陆
3.禁用selinux
   	sed -i 's/enforcing/disabled/' /etc/selinux/config
	setenforce 0
	getenforce
4.部署k8s所需要的内核优化
# Kernel prerequisites for kubernetes: load the overlay and br_netfilter
# modules immediately, persist them across reboots, then enable bridged
# traffic filtering and IPv4 forwarding.
sudo modprobe overlay
sudo modprobe br_netfilter

# Persist module loading across reboots.
sudo tee /etc/modules-load.d/k8s.conf <<EOF
overlay
br_netfilter
EOF

# Required sysctl parameters; dropping them in /etc/sysctl.d keeps them
# effective after a reboot.
sudo tee /etc/sysctl.d/k8s.conf <<EOF
net.bridge.bridge-nf-call-iptables  = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward                 = 1
EOF

# Apply the sysctl parameters now, without rebooting.
sudo sysctl --system
2.部署keepalived+haproxy配置vip
# Run on every machine: install keepalived + haproxy and back up the
# distribution-default configs before overwriting them below.
yum install -y keepalived haproxy
cp  /etc/haproxy/haproxy.cfg /etc/haproxy/haproxy.cfg.bak
cp /etc/keepalived/keepalived.conf /etc/keepalived/keepalived.conf.bak

# haproxy configuration: every node runs an identical haproxy listening on
# TCP 16443 and load-balancing across the three kube-apiservers on 6443.
# keepalived (configured below) floats the VIP between the nodes.

cat > /etc/haproxy/haproxy.cfg << EOF
global
    log         127.0.0.1 local2

    chroot      /var/lib/haproxy
    pidfile     /var/run/haproxy.pid
    maxconn     4000
    user        haproxy
    group       haproxy
    daemon

    # turn on stats unix socket
    stats socket /var/lib/haproxy/stats

defaults
    mode                    http
    log                     global
    option                  httplog
    option                  dontlognull
    option http-server-close
    option forwardfor       except 127.0.0.0/8
    option                  redispatch
    retries                 3
    timeout http-request    10s
    timeout queue           1m
    timeout connect         10s
    timeout client          1m
    timeout server          1m
    timeout http-keep-alive 10s
    timeout check           10s
    maxconn                 3000


frontend k8s-master
    bind 0.0.0.0:16443
    bind 127.0.0.1:16443
    mode tcp
    option tcplog
    tcp-request inspect-delay 5s
    default_backend k8s-master
backend k8s-master
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    server k8s-master-1 192.168.20.127:6443  check
    server k8s-master-2 192.168.20.32:6443  check
    server k8s-master-3 192.168.20.121:6443  check
EOF

# keepalived configuration: advertises the VIP 192.168.20.150 and lowers this
# node's priority when haproxy dies (vrrp_script check_haproxy uses
# "killall -0" as a cheap liveness probe).
# NOTE(review): this document applies the same config on every node; on the
# other two masters, state should be BACKUP with a lower priority so only one
# node claims the VIP at a time — confirm before rollout.

cat > /etc/keepalived/keepalived.conf <<EOF
! Configuration File for keepalived
global_defs {
   router_id k8s
}
vrrp_script check_haproxy {
    script "killall -0 haproxy"
    interval 3
    weight -2
    fall 10
    rise 2
}
vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 250
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass k8s
    }
    virtual_ipaddress {
        192.168.20.150/24
    }
    track_script {
        check_haproxy
    }
}
EOF
systemctl enable haproxy;systemctl start haproxy
# Fix: the original ran "systemctl start keepalived" twice and never enabled
# the service, so keepalived would not come back after a reboot.
systemctl enable keepalived;systemctl start keepalived
ip a | grep eth0
# Verify the VIP is present and fails over to another node when haproxy stops.
3.安装crio+(kubeadm,kubectl,kubelet)

crio指定版本安装
参考https://cri-o.io/
OS指定系统
VERSION指定版本

# Install cri-o pinned to the 1.24 stream for CentOS 7 (see https://cri-o.io/
# for the supported OS/VERSION combinations).
OS=CentOS_7
VERSION=1.24
export OS VERSION
curl -L -o /etc/yum.repos.d/devel:kubic:libcontainers:stable.repo https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/$OS/devel:kubic:libcontainers:stable.repo
curl -L -o /etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:$VERSION.repo https://download.opensuse.org/repositories/devel:kubic:libcontainers:stable:cri-o:$VERSION/$OS/devel:kubic:libcontainers:stable:cri-o:$VERSION.repo
yum install cri-o -y
systemctl enable crio
systemctl start crio

三件套指定版本安装
# Kubernetes yum repo via the Aliyun mirror (see the Aliyun open-source
# mirror documentation).
# NOTE: the repo installs the latest release by default — pin the exact
# package versions in yum install, as done below for 1.26.0.

cat <<EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF
setenforce 0
# Unpinned variant kept for reference:
#yum install -y kubelet kubeadm kubectl
yum install -y kubelet-1.26.0-0.x86_64 kubeadm-1.26.0-0.x86_64 kubectl-1.26.0-0.x86_64
systemctl enable kubelet && systemctl start kubelet

4.配置crio+kubelet
#!所有节点执行
#网段修改操作,记得和kubeadm init文件保持一致
sed -i 's/10.85.0.0/10.244.0.0/g' /etc/cni/net.d/100-crio-bridge.conf
#kubelet 操作:
vim  /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf	,添加
 --container-runtime=remote --cgroup-driver=systemd --container-runtime-endpoint=unix:///var/run/crio/crio.sock --runtime-request-timeout=5m
systemctl daemon-reload;systemctl restart kubelet
[root@k8s-master-2 ~]# cat /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf | grep -v "^#"
[Service]
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
EnvironmentFile=-/etc/sysconfig/kubelet
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS --container-runtime=remote --cgroup-driver=systemd --container-runtime-endpoint=unix:///var/run/crio/crio.sock --runtime-request-timeout=5m
#crio配置
[root@k8s-master-2 ~]# cat /etc/crictl.yaml
runtime-endpoint: unix:///var/run/crio/crio.sock
image-endpoint: unix:///var/run/crio/crio.sock
timeout: 10
debug: false
pull-image-on-create: true
disable-pull-on-run: false
#直接删除原文件,写入新文件会报错,建议原文件备份修改,然后重启crio
[root@k8s-master-2 ~]# grep -Env '#|^$|^\[' /etc/crio/crio.conf
136:selinux = true
169:cgroup_manager = "systemd"
455:pause_image = "pww.artifactory.cdi.philips.com/tools/k8s1.26/pause:3.9"
478:insecure_registries = [
479: "docker.io",
480: "pww.artifactory.cdi.philips.com",
481: "registry.access.redhat.com",
482: "quay.io",
483: "registry.aliyuncs.com"
484: ]
505:plugin_dirs = [
506:    "/opt/cni/bin",
507:    "/usr/libexec/cni",
508:]
514:enable_metrics = true
548:metrics_port = 9537

5.生成init配置文件,并为init做准备
#master01执行
	kubeadm config print init-defaults > kubeadm-config.yaml
	
[root@k8s-master-1 ~]# cat kubeadm/kubeadm-config.yaml
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
- groups:
  - system:bootstrappers:kubeadm:default-node-token
  token: abcdef.0123456789abcdef
  ttl: 24h0m0s
  usages:
  - signing
  - authentication
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 192.168.20.127 # this node's IP
  bindPort: 6443
nodeRegistration:
  criSocket: unix:///var/run/crio/crio.sock  # cri-o socket path
  imagePullPolicy: IfNotPresent
  #name: node
  taints: null
---
apiServer:
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controlPlaneEndpoint: "192.168.20.150:16443"	# VIP and haproxy frontend port
controllerManager: {}
dns: {}
etcd:
  local:
    dataDir: /var/lib/etcd
# Fix: host was "pww.artifactory.cdi.com", inconsistent with the registry used
# everywhere else in this document (crio pause_image, insecure_registries,
# the nginx test image). Images were pre-pushed to the internal registry;
# pre-pull them on each node with "crictl pull".
imageRepository: pww.artifactory.cdi.philips.com/tools/k8s1.26
kind: ClusterConfiguration
kubernetesVersion: 1.26.0
networking:
  dnsDomain: cluster.local
  podSubnet: 10.244.0.0/16  # pod CIDR; must match /etc/cni/net.d/100-crio-bridge.conf
  serviceSubnet: 10.1.0.0/16 # service CIDR
scheduler: {}
#---
#apiVersion: kubeproxy.config.k8s.io/v1alpha1
#kind: KubeProxyConfiguration
#featureGates:
#  SupportIPVSProxyMode: true
#mode: ipvs

执行以下命令可查看镜像信息

kubeadm config images list --config kubeadm-config.yaml

下载主节点镜像

 kubeadm config images pull --config kubeadm-config.yaml
 #这个命令不好用,个人建议是list镜像信息后,
 #通过其他容器命令buildah,docker pull 镜像,然后上传到私库
 #然后机器上使用crictl pull 私库镜像下载下来再init  
6.kubeadm init
kubeadm reset -f  #失败的话,就排查原因用这个命令重试
kubeadm init --config=kubeadm-config.yaml  | tee kubeadm-init.log
#成功会显示出以下信息
To start using your cluster, you need to run the following as a regular user:

  mkdir -p $HOME/.kube
  sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
  sudo chown $(id -u):$(id -g) $HOME/.kube/config

Alternatively, if you are the root user, you can run:

  export KUBECONFIG=/etc/kubernetes/admin.conf

You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
  https://kubernetes.io/docs/concepts/cluster-administration/addons/

You can now join any number of control-plane nodes by copying certificate authorities
and service account keys on each node and then running the following as root:

  kubeadm join 192.168.20.150:16443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:6e6990afc6b65416d7b1a2b00c154798dc38c1c611aa02b93754f35870dafae4 \
        --control-plane

Then you can join any number of worker nodes by running the following on each as root:

kubeadm join 192.168.20.150:16443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:6e6990afc6b65416d7b1a2b00c154798dc38c1c611aa02b93754f35870dafae4
7.join control-plane nodes and join worker nodes

master节点join还需要从master01拷贝证书文件
worker节点直接执行命令就可以

scp -r  /etc/kubernetes/pki centos@k8s-master-2:/home/centos/
ssh centos@k8s-master-2 "sudo cp -ra /home/centos/pki  /etc/kubernetes/"

master2执行: 
  # Remove the per-node certs copied from master01 so kubeadm regenerates them
  # with this node's SANs; keep only the shared CA material.
  cd /etc/kubernetes/pki;ls | grep api | xargs rm -f
  # Fix: escape the dot — in a grep regex "ca." matches any character after
  # "ca", so an unrelated file starting with "ca" plus any char would be kept
  # (or a wanted one deleted). 'ca\.' keeps exactly ca.crt / ca.key.
  cd /etc/kubernetes/pki/etcd;ls | grep -v 'ca\.' | xargs rm -f
#然后执行上面init成功显示的命令
8.部署网络插件
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
#如果pod发现起不来,查看日志发现有问题,宿主机flannel网卡和cni网卡网段不一致可以删掉网卡重建
sudo ifconfig cni0 down    
sudo ip link delete cni0
9.验证部署状态
kubectl get cs
kubectl get pod -A
#部署个pod和service验证一下访问
[root@k8s-master-1 ~]# cat centos.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
  name: centos
spec:
  replicas: 3
  selector:
    matchLabels:
      app: httpd
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: httpd
    spec:
      containers:
        - resources: {}
          terminationMessagePath: /dev/termination-log
          name: httpd
#          command:
#            - /bin/sh
          ports:
            # Fix: nginx serves on 80 (the Service targetPort below); the
            # original declared 8080 here, which is misleading even though
            # containerPort is informational only.
            - containerPort: 80
              protocol: TCP
          imagePullPolicy: IfNotPresent
          terminationMessagePolicy: File
          image: >-
            pww.artifactory.cdi.philips.com/tools/nginx:latest
#          args:
#            - '-c'
#            - while true;do echo hello;sleep 100;done
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
---
# NodePort Service exposing the nginx pods on a random high port of every node.
apiVersion: v1
kind: Service
metadata:
  name: centos
spec:
  selector:
    app: httpd
  type: NodePort
  ports:
  - name: http
    port: 80
    targetPort: 80
#    nodePort: 30036
    protocol: TCP


尝试访问所有主机的nodeport,如果访问不通,
nc -vz ip:port 发现timeout,排查掉安全组和其他策略后,
master01 ip add 发现

[root@k8s-master-1 ~]# ip add
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1442 qdisc pfifo_fast state UP group default qlen 1000
    link/ether fa:16:3e:59:a4:17 brd ff:ff:ff:ff:ff:ff
    inet 192.168.20.127/24 brd 192.168.20.255 scope global dynamic eth0
       valid_lft 37859sec preferred_lft 37859sec
    inet 192.168.20.150/24 scope global secondary eth0
       valid_lft forever preferred_lft forever
    inet6 fe80::f816:3eff:fe59:a417/64 scope link
       valid_lft forever preferred_lft forever
3: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1392 qdisc noqueue state UNKNOWN group default
    link/ether ee:9b:21:8e:1f:0b brd ff:ff:ff:ff:ff:ff
    inet 10.244.0.0/32 scope global flannel.1
       valid_lft forever preferred_lft forever
    inet6 fe80::ec9b:21ff:fe8e:1f0b/64 scope link
       valid_lft forever preferred_lft forever
4: cni0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1392 qdisc noqueue state UP group default qlen 1000
    link/ether 9e:78:14:ec:5c:79 brd ff:ff:ff:ff:ff:ff
    inet 10.244.0.1/24 brd 10.244.0.255 scope global cni0
       valid_lft forever preferred_lft forever
    inet6 fe80::9c78:14ff:feec:5c79/64 scope link
       valid_lft forever preferred_lft forever
5: veth2fd0488d@if2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1392 qdisc noqueue master cni0 state UP group default
    link/ether e6:f5:1d:c1:08:9f brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet6 fe80::e4f5:1dff:fec1:89f/64 scope link
       valid_lft forever preferred_lft forever
6: vethb48239f1@if2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1392 qdisc noqueue master cni0 state UP group default
    link/ether 3e:89:6a:bb:c7:65 brd ff:ff:ff:ff:ff:ff link-netnsid 1
    inet6 fe80::3c89:6aff:febb:c765/64 scope link
       valid_lft forever preferred_lft forever

master02 ip add 发现

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1442 qdisc pfifo_fast state UP group default qlen 1000
    link/ether fa:16:3e:28:ca:4d brd ff:ff:ff:ff:ff:ff
    inet 192.168.20.32/24 brd 192.168.20.255 scope global dynamic eth0
       valid_lft 42890sec preferred_lft 42890sec
    inet6 fe80::f816:3eff:fe28:ca4d/64 scope link
       valid_lft forever preferred_lft forever
3: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1392 qdisc noqueue state UNKNOWN group default
    link/ether ba:06:d0:c4:f9:b0 brd ff:ff:ff:ff:ff:ff
    inet 10.244.1.0/32 scope global flannel.1
       valid_lft forever preferred_lft forever
    inet6 fe80::b806:d0ff:fec4:f9b0/64 scope link
       valid_lft forever preferred_lft forever

现在问题为没有cni网卡如何解决:
我想到了两种恢复手段。第一,直接将节点踢出集群,然后重新加入。第二,尝试手工恢复cni0虚拟网卡,将pod网络接回来。第一种简单粗暴些为了避免出现修改影响到master组件。我倾向于尝试第二种方式。

第一种方案:

# Run on the node being removed:
kubeadm reset
rm -rf /etc/kubernetes/admin.conf
rm -rf "$HOME/.kube/config"
# Run on a master (replace xxx with the node name):
kubectl delete node xxx
# Then back on the node, delete the cni and flannel interfaces:
kubeadm reset

ifconfig cni0 down
ifconfig flannel.1 down
# Fix: removed the invalid "ifconfig del <iface>" lines from the original —
# ifconfig expects the interface name first, so those commands could never
# work; the "ip link del" commands below are what actually delete the links.
ip link del flannel.1
ip link del cni0

# Some commands may report "device not found" — safe to ignore; the
# interfaces are recreated when the node rejoins the cluster.

master操作
加入节点准备工作

# 通过 kubedam 重新生成 token
[root@master ~]# kubeadm token create --print-join-command
~~
kubeadm join 192.168.247.136:6443 --token x5phh9.9lpb629032p7dseb     --discovery-token-ca-cert-hash sha256:bd23534d635b46f5316f0d388bd88853a6ddb47b1c04129bf25ea31cdbbfba4a 
 
node节点重新加入
slave 执行join命令
###未验证

第二种方案:

如下是pod进出网络流量大致流程:

pod中产生数据,根据pod的路由信息,将数据发送到cni0;
cni0 根据node节点的路由表,将数据发送到隧道设备flannel.1;
flannel.1查看数据包的目的ip,从flanneld获得对端隧道设备的必要信息,封装数据包;
flannel.1将数据包发送到对端设备。对端节点的网卡接收到数据包,发现数据包为overlay数据包,解开外层封装,并发送内层封装到flannel.1设备;
数据达到node节点的flannel.1设备查看数据包,根据路由表匹配,将数据发送给cni0设备;
cni0匹配路由表,发送数据给网桥上对应的端口。
从通信过程可以知道,pod的网络需要连接到cni0网桥,而cni0和flannel.1网桥之间是没有连接的,通过node节点的路由表来实现转发通信的。所以,这里只需要将node节点所有的pod的网络虚拟对(veth pair)找到,然后将其中一端连接到重新创建的cni0虚拟网桥应该就可以了。变成了如下两个问题:

第一:如何创建cni0网桥并配置正确对应的参数?

由于flannel使用的是vxlan模式,所以创建cni0网桥的时候需要注意mtu值的设置。如下,创建cni0网桥:

// 创建cni0设备,指定类型为网桥
# ip link add cni0 type bridge
# ip link set dev cni0 up
// 为cni0设置ip地址,这个地址是pod的网关地址,需要和flannel.1对应网段
# ifconfig cni0 172.28.0.1/25
// 为cni0设置mtu为1450
# ifconfig cni0 mtu 1450 up


// 查看创建情况
# ifconfig cni0
cni0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 172.28.0.1  netmask 255.255.255.128  broadcast 172.28.0.127
        ether 0e:5e:b9:62:0d:60  txqueuelen 1000  (Ethernet)
        RX packets 487334  bytes 149990594 (149.9 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 629306  bytes 925100055 (925.1 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
// 此时查看路由表,也已经有了去往本机pod网段的cni0信息
# route -n | grep cni0
172.28.0.0      0.0.0.0         255.255.255.128 U     0      0        0 cni0

第二:如何准确找出每个pod的对应的veth pair虚拟对,是否和名称空间有关系?
在一台只有两个pod的节点上查看虚拟网卡的情况,发现在node主机上可以看到两个veth前缀的虚拟网卡,它们的另一端在pod中,并且pod的netns也可以通过show命令看到。值得注意的是,在node节点同网络名称空间下的vethf0978d0c和vethb9525687的连接设置master为cni0。所以,找到所有的veth前缀的虚拟网卡,并将其挂载到cni0上即可。

// 这里通过一个简单的脚本批量将veth的虚拟网卡挂载到cni0网桥上
# Re-attach every orphaned veth (those without a "master" bridge) to cni0.
# The awk field separator '[@|:]' splits on '@', '|' or ':' to extract the
# interface name from lines like "5: veth2fd0488d@if2: ...".
for veth in $(ip addr | grep veth | grep -v master | awk -F'[@|:]' '{print $2}' | sed 's/ //g')
do
  # Quote the expansion so an unexpected name cannot word-split or glob.
  ip link set dev "$veth" master cni0
done

通过以上两步操作后,失联的pod已经可以ping通,并和其它节点的pod正常通信了。测试新建删除pod都是正常的。
至此,手工恢复cni0算是完成了。如果还有下次 ip link del cni0的操作,可以不做节点排水操作。通过这个方式以最快,影响最小的方式恢复pod网络通信。这里也暴露了一个问题,那就是master节点的操作规范问题,对于生产环境,我们应该尽可能避免直接登录到master节点上进行操作,应该在其它管理机上授权k8s权限去操作。然后master节点尽量和业务分开独立部署,以保证master节点的稳定性。

Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐