1. Building the k8s cluster environment

1.1 Node planning

1.1.1 Plan the IP addresses

1.1.2 Set the hostnames and hosts file
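No concrete commands are given for this step; a minimal sketch, run on each node, assuming hostnames of your own choosing (the names below are placeholders matching the planned IPs):

# set this node's hostname (example name, adjust per node)
hostnamectl set-hostname k8s-master1

# append the planned addresses to /etc/hosts on every node
cat >> /etc/hosts <<EOF
192.168.1.200 k8s-etcd1
192.168.1.201 k8s-etcd2
192.168.1.202 k8s-master1
192.168.1.205 k8s-master2
192.168.1.206 k8s-node1
192.168.1.207 k8s-node2
EOF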

1.2 Configure SSH key distribution

ssh-keygen
apt install sshpass -y

#!/bin/bash
# target host list
IP="
192.168.1.200
192.168.1.201
192.168.1.202
192.168.1.205
192.168.1.206
192.168.1.207
"
for node in ${IP};do
  sshpass -p 123456 ssh-copy-id -o StrictHostKeyChecking=no ${node}
  echo "${node} key distribution finished"
  ssh ${node} ln -sv /usr/bin/python3 /usr/bin/python
  echo "${node} /usr/bin/python3 symlink created"
done
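Once the loop finishes, password-less login can be spot-checked from the deploy node, for example:

ssh 192.168.1.200 hostname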

1.3 Download the kubeasz project and components

On the deploy node:
apt install ansible git -y

wget https://github.com/easzlab/kubeasz/releases/download/3.3.1/ezdown
chmod +x ezdown
./ezdown -D    # download the components

root@ubuntu20:~# ll /etc/kubeasz/
total 108
drwxrwxr-x  12 root root   224 Apr  1 14:17 ./
drwxr-xr-x 107 root root  8192 Apr  1 14:17 ../
drwxrwxr-x   3 root root    23 Jul  3  2022 .github/
-rw-rw-r--   1 root root   301 Jul  3  2022 .gitignore
-rw-rw-r--   1 root root  5058 Jul  3  2022 README.md
-rw-rw-r--   1 root root 20304 Jul  3  2022 ansible.cfg
drwxr-xr-x   3 root root  4096 Apr  1 14:17 bin/
drwxrwxr-x   8 root root    94 Jul  3  2022 docs/
drwxr-xr-x   2 root root   230 Apr  1 14:27 down/
drwxrwxr-x   2 root root    70 Jul  3  2022 example/
-rwxrwxr-x   1 root root 25012 Jul  3  2022 ezctl*
-rwxrwxr-x   1 root root 25266 Jul  3  2022 ezdown*
drwxrwxr-x  10 root root   145 Jul  3  2022 manifests/
drwxrwxr-x   2 root root   322 Jul  3  2022 pics/
drwxrwxr-x   2 root root  4096 Jul  3  2022 playbooks/
drwxrwxr-x  22 root root   323 Jul  3  2022 roles/
drwxrwxr-x   2 root root    48 Jul  3  2022 tools/

1.4 Customize the hosts file

cd /etc/kubeasz
./ezctl --help
./ezctl new k8s-cluster1    # create the cluster

Edit the configuration to generate and customize the hosts file:
vim /etc/kubeasz/clusters/k8s-cluster1/hosts

[etcd]
192.168.1.200
192.168.1.201

# master node(s)
[kube_master]
192.168.1.202
192.168.1.205

# work node(s)
[kube_node]
192.168.1.206
192.168.1.207

Edit the config.yml file:
cat /etc/kubeasz/clusters/k8s-cluster1/config.yml

root@ubuntu20:/etc/kubeasz/bin# cat /etc/kubeasz/clusters/k8s-cluster1/config.yml 
############################
# prepare
############################
# optional: install system packages offline or online (offline|online)
INSTALL_SOURCE: "online"

# optional: OS security hardening, see github.com/dev-sec/ansible-collection-hardening
OS_HARDEN: false


############################
# role:deploy
############################
# default: ca will expire in 100 years
# default: certs issued by the ca will expire in 50 years
CA_EXPIRY: "876000h"
CERT_EXPIRY: "438000h"

# kubeconfig parameters
CLUSTER_NAME: "cluster1"
CONTEXT_NAME: "context-{{ CLUSTER_NAME }}"

# k8s version
K8S_VER: "1.24.2"

############################
# role:etcd
############################
# using a separate wal directory avoids disk I/O contention and improves performance
ETCD_DATA_DIR: "/var/lib/etcd"
ETCD_WAL_DIR: ""


############################
# role:runtime [containerd,docker]
############################
# ------------------------------------------- containerd
# [.] enable container registry mirrors
ENABLE_MIRROR_REGISTRY: true

# [containerd] sandbox (pause) image
SANDBOX_IMAGE: "harbor.luohw.net/baseimage/pause:3.7"

# [containerd] container persistent storage directory
CONTAINERD_STORAGE_DIR: "/var/lib/containerd"

# ------------------------------------------- docker
# [docker] container storage directory
DOCKER_STORAGE_DIR: "/var/lib/docker"

# [docker] enable the remote RESTful API
ENABLE_REMOTE_API: false

# [docker] trusted insecure (HTTP) registries
INSECURE_REG: '["http://easzlab.io.local:5000","harbor.luohw.net"]'


############################
# role:kube-master
############################
# certificate hosts for the k8s master nodes; more IPs and domains can be added (e.g. a public IP and domain)
MASTER_CERT_HOSTS:
  - "192.168.1.188"
  - "192.168.1.189"
  - "192.168.1.190"
  - "192.168.1.191"
  - "api.luohw.net"
  - "k8s.easzlab.io"
  
  #- "www.test.com"

# prefix length of the pod subnet on each node (determines how many pod IPs a node can allocate)
# if flannel runs with --kube-subnet-mgr, it reads this setting to assign a pod subnet per node
# https://github.com/coreos/flannel/issues/847
NODE_CIDR_LEN: 24


############################
# role:kube-node
############################
# kubelet root directory
KUBELET_ROOT_DIR: "/var/lib/kubelet"

# maximum number of pods per node
MAX_PODS: 500

# resources reserved for kube components (kubelet, kube-proxy, dockerd, etc.)
# see templates/kubelet-config.yaml.j2 for the actual values
KUBE_RESERVED_ENABLED: "no"

# upstream k8s advises against enabling system-reserved casually unless long-term monitoring
# tells you the system's actual resource usage; the reservation should grow as the system runs
# longer, see templates/kubelet-config.yaml.j2 for the values. The defaults assume a 4c/8g VM
# with a minimal install of system services; increase them on high-end physical machines.
# Note that apiserver and friends spike during cluster installation, so reserve at least 1 GB of memory
SYS_RESERVED_ENABLED: "no"


############################
# role:network [flannel,calico,cilium,kube-ovn,kube-router]
############################
# ------------------------------------------- flannel
# [flannel] set the flannel backend: "host-gw", "vxlan", etc.
FLANNEL_BACKEND: "vxlan"
DIRECT_ROUTING: false

# [flannel] flanneld_image: "quay.io/coreos/flannel:v0.10.0-amd64"
flannelVer: "v0.15.1"
flanneld_image: "easzlab.io.local:5000/easzlab/flannel:{{ flannelVer }}"

# ------------------------------------------- calico
# [calico] setting CALICO_IPV4POOL_IPIP="off" can improve network performance; see docs/setup/calico.md for the constraints
CALICO_IPV4POOL_IPIP: "Always"

# [calico] host IP used by calico-node; BGP peerings are established over this address, set manually or auto-detected
IP_AUTODETECTION_METHOD: "can-reach={{ groups['kube_master'][0] }}"

# [calico] calico network backend: brid, vxlan, none
CALICO_NETWORKING_BACKEND: "brid"

# [calico] whether calico uses route reflectors
# recommended once the cluster grows beyond 50 nodes
CALICO_RR_ENABLED: false

# CALICO_RR_NODES lists the route reflector nodes; if unset, the cluster master nodes are used
# CALICO_RR_NODES: ["192.168.1.1", "192.168.1.2"]
CALICO_RR_NODES: []

# [calico] supported calico versions: [v3.3.x] [v3.4.x] [v3.8.x] [v3.15.x]
calico_ver: "v3.19.4"

# [calico] calico major.minor version
calico_ver_main: "{{ calico_ver.split('.')[0] }}.{{ calico_ver.split('.')[1] }}"

# ------------------------------------------- cilium
# [cilium] image version
cilium_ver: "1.11.6"
cilium_connectivity_check: true
cilium_hubble_enabled: false
cilium_hubble_ui_enabled: false

# ------------------------------------------- kube-ovn
# [kube-ovn] node for the OVN DB and OVN Control Plane, defaults to the first master node
OVN_DB_NODE: "{{ groups['kube_master'][0] }}"

# [kube-ovn] offline image tarball
kube_ovn_ver: "v1.5.3"

# ------------------------------------------- kube-router
# [kube-router] public clouds usually require ipinip to stay enabled; on your own infrastructure this can be set to "subnet"
OVERLAY_TYPE: "full"

# [kube-router] toggle NetworkPolicy support
FIREWALL_ENABLE: true

# [kube-router] kube-router image version
kube_router_ver: "v0.3.1"
busybox_ver: "1.28.4"


############################
# role:cluster-addon
############################
# automatically install coredns
dns_install: "no"
corednsVer: "1.9.3"
ENABLE_LOCAL_DNS_CACHE: false
dnsNodeCacheVer: "1.21.1"
# address of the local dns cache
LOCAL_DNS_CACHE: "169.254.20.10"

# automatically install metrics-server
metricsserver_install: "no"
metricsVer: "v0.5.2"

# automatically install dashboard
dashboard_install: "no"
dashboardVer: "v2.5.1"
dashboardMetricsScraperVer: "v1.0.8"

# automatically install prometheus
prom_install: "no"
prom_namespace: "monitor"
prom_chart_ver: "35.5.1"

# automatically install nfs-provisioner
nfs_provisioner_install: "no"
nfs_provisioner_namespace: "kube-system"
nfs_provisioner_ver: "v4.0.2"
nfs_storage_class: "managed-nfs-storage"
nfs_server: "192.168.1.10"
nfs_path: "/data/nfs"

# automatically install network-check
network_check_enabled: false 
network_check_schedule: "*/5 * * * *"

############################
# role:harbor
############################
# harbor version, full version string
HARBOR_VER: "v2.1.3"
HARBOR_DOMAIN: "harbor.easzlab.io.local"
HARBOR_TLS_PORT: 8443

# if set 'false', you need to put certs named harbor.pem and harbor-key.pem in directory 'down'
HARBOR_SELF_SIGNED_CERT: true

# install extra component
HARBOR_WITH_NOTARY: false
HARBOR_WITH_TRIVY: false
HARBOR_WITH_CLAIR: false
HARBOR_WITH_CHARTMUSEUM: true

Upload the pause image to the harbor registry from the deploy node

mkdir  /etc/docker/certs.d/harbor.luohw.net 
scp /apps/harbor/certs/luohw.com.crt 192.168.1.200:/etc/docker/certs.d/harbor.luohw.net/ 
docker login harbor.luohw.net

docker tag 221177c6082a   harbor.luohw.net/baseimage/pause:3.7

docker push harbor.luohw.net/baseimage/pause:3.7
Configure name resolution for harbor on all nodes
echo "192.168.1.205 harbor.luohw.net" >> /etc/hosts
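If every node needs the same entry, one option is to push it over ssh, reusing the node list from the key-distribution script (a sketch):

for node in 192.168.1.200 192.168.1.201 192.168.1.202 192.168.1.205 192.168.1.206 192.168.1.207; do
  ssh ${node} "grep -q harbor.luohw.net /etc/hosts || echo '192.168.1.205 harbor.luohw.net' >> /etc/hosts"
done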

Run the initialization step

./ezctl setup k8s-cluster1 01    # prepare the CA and base system settings

Deploy the etcd cluster

./ezctl setup k8s-cluster1 02    # deploy the etcd cluster

Verify the etcd service against each etcd server:

root@ubuntu20:~# export NODE_IPS="192.168.1.200 192.168.1.201"
root@ubuntu20:~# 
root@ubuntu20:~#  for ip in ${NODE_IPS}; do   ETCDCTL_API=3 /usr/local/bin/etcdctl   --endpoints=https://${ip}:2379    --cacert=/etc/kubernetes/ssl/ca.pem   --cert=/etc/kubernetes/ssl/etcd.pem   --key=/etc/kubernetes/ssl/etcd-key.pem   endpoint health; done
https://192.168.1.200:2379 is healthy: successfully committed proposal: took = 20.090962ms
https://192.168.1.201:2379 is healthy: successfully committed proposal: took = 17.58472ms
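A member list query can complement the health check; a sketch using the same certificates against one endpoint:

ETCDCTL_API=3 /usr/local/bin/etcdctl --endpoints=https://192.168.1.200:2379 \
  --cacert=/etc/kubernetes/ssl/ca.pem --cert=/etc/kubernetes/ssl/etcd.pem \
  --key=/etc/kubernetes/ssl/etcd-key.pem member list -w table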

Deploy the container runtime

First, adjust containerd's configuration so that containerd can reach the self-hosted harbor, adding the private registry settings:


vi /etc/containerd/config.toml 
     
        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."harbor.luohw.net"]
          endpoint = ["https://harbor.luohw.net"]
        [plugins."io.containerd.grpc.v1.cri".registry.configs."harbor.luohw.net".tls]
          insecure_skip_verify = true
        [plugins."io.containerd.grpc.v1.cri".registry.configs."harbor.luohw.net".auth]
          username = "admin"
          password = "123456"

systemctl restart containerd
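A quick pull through the new registry settings confirms they took effect (the pause image was pushed to this path earlier):

crictl pull harbor.luohw.net/baseimage/pause:3.7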
Add the same settings to the kubeasz template so that nodes deployed later also get the up-to-date configuration,
adding them at line 157:
root@ubuntu20:/etc/kubeasz# pwd
/etc/kubeasz
root@ubuntu20:/etc/kubeasz# vim roles/containerd/templates/config.toml.j2

./ezctl setup k8s-cluster1 03

Troubleshooting

root@ubuntu20:~# crictl  pull  harbor.luohw.net/baseimage/paus:3.7
E0402 14:44:08.277290  558467 remote_image.go:238] "PullImage from image service failed" err="rpc error: code = Unknown desc = failed to pull and unpack image \"harbor.luohw.net/baseimage/paus:3.7\": failed to resolve reference \"harbor.luohw.net/baseimage/paus:3.7\": failed to do request: Head \"https://harbor.luohw.net/v2/baseimage/paus/manifests/3.7\": x509: certificate signed by unknown authority" image="harbor.luohw.net/baseimage/paus:3.7"
FATA[0000] pulling image: rpc error: code = Unknown desc = failed to pull and unpack image "harbor.luohw.net/baseimage/paus:3.7": failed to resolve reference "harbor.luohw.net/baseimage/paus:3.7": failed to do request: Head "https://harbor.luohw.net/v2/baseimage/paus/manifests/3.7": x509: certificate signed by unknown authority
Pay attention to the correctness of the /etc/containerd/config.toml file.
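To see what containerd actually loaded, the merged configuration can be dumped and searched (a sketch):

containerd config dump | grep -A 3 'harbor.luohw.net'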

Deploy the master nodes

./ezctl setup k8s-cluster1 04

Deploy the worker nodes

./ezctl setup k8s-cluster1 05
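After steps 04 and 05 the masters and workers should have registered with the apiserver; a quick check from the deploy node:

kubectl get node -o wide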

Deploy the network plugin

  1. On the deploy node, re-tag the calico images and push them to the registry

docker push harbor.luohw.net/baseimage/calico-pod2daemon-flexvol:v3.19.4
docker push harbor.luohw.net/baseimage/calico-cni:v3.19.4
docker push harbor.luohw.net/baseimage/calico-kube-controllers:v3.19.4
docker push harbor.luohw.net/baseimage/calico-node:v3.19.4
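The pushes above assume the images were first re-tagged from the copies pulled locally by ezdown; a sketch of that step (the source image names are assumptions and may differ from your local image list):

docker tag calico/cni:v3.19.4                harbor.luohw.net/baseimage/calico-cni:v3.19.4
docker tag calico/pod2daemon-flexvol:v3.19.4 harbor.luohw.net/baseimage/calico-pod2daemon-flexvol:v3.19.4
docker tag calico/node:v3.19.4               harbor.luohw.net/baseimage/calico-node:v3.19.4
docker tag calico/kube-controllers:v3.19.4   harbor.luohw.net/baseimage/calico-kube-controllers:v3.19.4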

Then point the image references in the manifest template at the private registry:

root@ubuntu20:~# cat /etc/kubeasz/roles/calico/templates/calico-v3.19.yaml.j2 |grep -n image
213:          image: harbor.luohw.net/baseimage/calico-cni:{{ calico_ver }}
257:          image: harbor.luohw.net/baseimage/calico-pod2daemon-flexvol:{{ calico_ver }}
268:          image: harbor.luohw.net/baseimage/calico-node:{{ calico_ver }}
517:          image: harbor.luohw.net/baseimage/calico-kube-controllers:{{ calico_ver }}

./ezctl setup k8s-cluster1 06
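After this step the calico pods should become Ready; a quick check (use calicoctl only if kubeasz placed it on the node):

kubectl get pod -n kube-system -o wide | grep calico
calicoctl node status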

Add kubectl command completion:
cat .bashrc
source <(kubectl completion bash) # generated by kubeasz

Deploy coredns

wget https://raw.githubusercontent.com/coredns/deployment/master/kubernetes/coredns.yaml.sed
docker tag coredns/coredns:1.9.4  harbor.luohw.net/baseimage/coredns:1.9.4
root@k8s-harbor:/apps/harbor# docker push  harbor.luohw.net/baseimage/coredns:1.9.4
root@k8s-harbor:/apps/harbor# docker tag coredns/coredns:1.9.4  harbor.luohw.net/baseimages/coredns:1.9.4
root@k8s-harbor:/apps/harbor# docker push  harbor.luohw.net/baseimages/coredns:1.9.4
# __MACHINE_GENERATED_WARNING__

apiVersion: v1
kind: ServiceAccount
metadata:
  name: coredns
  namespace: kube-system
  labels:
      kubernetes.io/cluster-service: "true"
      addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    kubernetes.io/bootstrapping: rbac-defaults
    addonmanager.kubernetes.io/mode: Reconcile
  name: system:coredns
rules:
- apiGroups:
  - ""
  resources:
  - endpoints
  - services
  - pods
  - namespaces
  verbs:
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
- apiGroups:
  - discovery.k8s.io
  resources:
  - endpointslices
  verbs:
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  annotations:
    rbac.authorization.kubernetes.io/autoupdate: "true"
  labels:
    kubernetes.io/bootstrapping: rbac-defaults
    addonmanager.kubernetes.io/mode: EnsureExists
  name: system:coredns
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:coredns
subjects:
- kind: ServiceAccount
  name: coredns
  namespace: kube-system
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: coredns
  namespace: kube-system
  labels:
      addonmanager.kubernetes.io/mode: EnsureExists
data:
  Corefile: |
    .:53 {
        errors
        health {
            lameduck 5s
        }
        log   {
        }
        ready
        kubernetes  cluster.local   in-addr.arpa ip6.arpa {
            pods insecure
            fallthrough in-addr.arpa ip6.arpa
            ttl 30
        }
        prometheus :9153
        forward . /etc/resolv.conf {
            max_concurrent 1000
        }
        cache 30
        loop
        reload
        loadbalance
    }
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: coredns
  namespace: kube-system
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "CoreDNS"
spec:
  # replicas: not specified here:
  # 1. In order to make Addon Manager do not reconcile this replicas parameter.
  # 2. Default is 1.
  # 3. Will be tuned in real time if DNS horizontal auto-scaling is turned on.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      k8s-app: kube-dns
  template:
    metadata:
      labels:
        k8s-app: kube-dns
    spec:
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      priorityClassName: system-cluster-critical
      serviceAccountName: coredns
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                  - key: k8s-app
                    operator: In
                    values: ["kube-dns"]
              topologyKey: kubernetes.io/hostname
      tolerations:
        - key: "CriticalAddonsOnly"
          operator: "Exists"
      nodeSelector:
        kubernetes.io/os: linux
      containers:
      - name: coredns
        image: harbor.luohw.net/baseimages/coredns:1.9.4
        imagePullPolicy: IfNotPresent
        resources:
          limits:
            memory: 256Mi
            cpu: 200m
          requests:
            cpu: 100m
            memory: 70Mi
        args: [ "-conf", "/etc/coredns/Corefile" ]
        volumeMounts:
        - name: config-volume
          mountPath: /etc/coredns
          readOnly: true
        ports:
        - containerPort: 53
          name: dns
          protocol: UDP
        - containerPort: 53
          name: dns-tcp
          protocol: TCP
        - containerPort: 9153
          name: metrics
          protocol: TCP
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
            scheme: HTTP
          initialDelaySeconds: 60
          timeoutSeconds: 5
          successThreshold: 1
          failureThreshold: 5
        readinessProbe:
          httpGet:
            path: /ready
            port: 8181
            scheme: HTTP
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - all
          readOnlyRootFilesystem: true
      dnsPolicy: Default
      volumes:
        - name: config-volume
          configMap:
            name: coredns
            items:
            - key: Corefile
              path: Corefile
---
apiVersion: v1
kind: Service
metadata:
  name: kube-dns
  namespace: kube-system
  annotations:
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "CoreDNS"
spec:
  selector:
    k8s-app: kube-dns
  clusterIP: 10.100.0.2
  ports:
  - name: dns
    port: 53
    protocol: UDP
  - name: dns-tcp
    port: 53
    protocol: TCP
  - name: metrics
    port: 9153
    protocol: TCP
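Assuming the manifest above is saved as coredns.yaml, it can be applied and DNS resolution checked from a throw-away pod (the busybox test image is an assumption; any image with nslookup works):

kubectl apply -f coredns.yaml
kubectl get pod -n kube-system -l k8s-app=kube-dns
kubectl run dns-test --rm -it --restart=Never --image=busybox:1.28 -- nslookup kubernetes.default.svc.cluster.local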

2. Cluster maintenance

Add a master node: ./ezctl add-master k8s-cluster1 192.168.1.30
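Once the play completes, the new master should appear in the node list (kubeasz typically registers nodes by their IP):

kubectl get node -o wide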

Cluster upgrade: to pick up upstream bug fixes

Download the new binary release from github
Unpack it

Upgrading a master (see the command sketch after this list):

On the node hosts, comment out the master to be upgraded in the kube-lb configuration
Reload it: systemctl reload kube-lb
Stop the kube-apiserver, kube-controller-manager, kube-scheduler, kubelet and kube-proxy services
Upgrade: replace the binaries on the master under /usr/local/bin
kube-apiserver kube-proxy kubectl kube-controller-manager kube-scheduler kubelet
Uncomment the master again and reload kube-lb
systemctl reload kube-lb
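A sketch of that sequence, assuming the kube-lb configuration lives at /etc/kube-lb/conf/kube-lb.conf as laid out by kubeasz and the new binaries are unpacked under kubernetes/server/bin/:

# 1. on every node: comment this master's server line out of /etc/kube-lb/conf/kube-lb.conf, then
systemctl reload kube-lb

# 2. on the master being upgraded
systemctl stop kube-apiserver kube-controller-manager kube-scheduler kubelet kube-proxy
cp kubernetes/server/bin/{kube-apiserver,kube-controller-manager,kube-scheduler,kubelet,kube-proxy,kubectl} /usr/local/bin/
systemctl start kube-apiserver kube-controller-manager kube-scheduler kubelet kube-proxy
kubectl get node        # confirm the new version is reported

# 3. on every node: uncomment the master in kube-lb.conf, then
systemctl reload kube-lb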

Upgrading a node (see the command sketch after this list)

Drain the pods: kubectl drain 192.168.1.30 --ignore-daemonsets
Once draining completes, stop kubelet and kube-proxy
Replace the binaries
Restart kubelet and kube-proxy
Verify the version
Allow scheduling again:
kubectl uncordon 192.168.1.30
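Put together as commands, run from a master or the deploy node (a sketch; binary paths follow the official release tarball layout):

kubectl drain 192.168.1.30 --ignore-daemonsets
ssh 192.168.1.30 "systemctl stop kubelet kube-proxy"
scp kubernetes/server/bin/{kubelet,kube-proxy} 192.168.1.30:/usr/local/bin/
ssh 192.168.1.30 "systemctl start kubelet kube-proxy"
kubectl get node 192.168.1.30 -o wide    # check the reported kubelet version
kubectl uncordon 192.168.1.30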
