kubeadm 部署 K8S Calico 组件 Pod 报错排查记录
calico, k8s, kubeadm
·
环境：每台节点双网卡
eth0：DHCP 动态获取 IP
eth1：SSH 管理专用，兼作 K8S 部署/集群通信 IP
首节点（192.168.40.240）机器发生重启后，集群出现异常。
[root@k8s-master01 ~]# kubectl get po -A -owide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-system calico-kube-controllers-55d57695c9-8gdjm 0/1 ContainerCreating 0 28s <none> node01 <none> <none>
kube-system calico-kube-controllers-55d57695c9-jnvsg 0/1 Terminating 0 4m55s <none> node02 <none> <none>
kube-system calico-kube-controllers-7bdbfc669-6l6n8 0/1 Terminating 445 26d <none> node01 <none> <none>
kube-system calico-kube-controllers-7bdbfc669-l9n4m 0/1 Terminating 0 25m <none> node01 <none> <none>
kube-system calico-node-4rlch 0/1 Running 0 28s 192.168.40.243 node01 <none> <none>
kube-system calico-node-8qvpr 0/1 Init:0/3 0 29s 192.168.40.240 k8s-master01 <none> <none>
kube-system calico-node-hl7b9 0/1 Running 0 28s 192.168.40.242 k8s-master03 <none> <none>
kube-system calico-node-hz22q 0/1 Running 0 28s 192.168.40.241 k8s-master02 <none> <none>
kube-system calico-node-vtqr9 0/1 Running 0 28s 192.168.40.244 node02 <none> <none>
kube-system calico-typha-6f7c754fc5-bwgp9 0/1 Pending 0 28s <none> <none> <none> <none>
kube-system calico-typha-6f7c754fc5-lz87n 1/1 Running 0 28s 192.168.40.244 node02 <none> <none>
kube-system calico-typha-6f7c754fc5-t8k22 1/1 Running 0 29s 192.168.40.243 node01 <none> <none>
kube-system coredns-567c556887-2k5gm 0/1 Unknown 1 26d <none> node01 <none> <none>
kube-system coredns-567c556887-gqxc2 1/1 Running 2 (32h ago) 15d 172.18.195.4 k8s-master03 <none> <none>
kube-system etcd-k8s-master01 1/1 Running 86 (109m ago) 30d 192.168.40.240 k8s-master01 <none> <none>
kube-system etcd-k8s-master02 1/1 Running 14 25h 192.168.40.241 k8s-master02 <none> <none>
kube-system etcd-k8s-master03 1/1 Running 3 25h 192.168.40.242 k8s-master03 <none> <none>
kube-system kube-apiserver-k8s-master01 1/1 Running 1759 (109m ago) 30d 192.168.40.240 k8s-master01 <none> <none>
kube-system kube-apiserver-k8s-master02 1/1 Running 99 (25h ago) 30d 192.168.40.241 k8s-master02 <none> <none>
kube-system kube-apiserver-k8s-master03 1/1 Running 95 (25h ago) 30d 192.168.40.242 k8s-master03 <none> <none>
kube-system kube-controller-manager-k8s-master01 1/1 Running 30 (109m ago) 30d 192.168.40.240 k8s-master01 <none> <none>
kube-system kube-controller-manager-k8s-master02 1/1 Running 28 30d 192.168.40.241 k8s-master02 <none> <none>
kube-system kube-controller-manager-k8s-master03 1/1 Running 29 (32h ago) 30d 192.168.40.242 k8s-master03 <none> <none>
kube-system kube-proxy-dg7w5 1/1 Running 0 18m 192.168.40.240 k8s-master01 <none> <none>
kube-system kube-proxy-h4cpx 1/1 Running 3 (25h ago) 30d 192.168.40.243 node01 <none> <none>
kube-system kube-proxy-ngrst 1/1 Running 3 (25h ago) 30d 192.168.40.244 node02 <none> <none>
kube-system kube-proxy-qn6lx 1/1 Running 3 (32h ago) 30d 192.168.40.242 k8s-master03 <none> <none>
kube-system kube-proxy-v49jf 1/1 Running 3 (32h ago) 30d 192.168.40.241 k8s-master02 <none> <none>
kube-system kube-scheduler-k8s-master01 1/1 Running 28 (109m ago) 30d 192.168.40.240 k8s-master01 <none> <none>
kube-system kube-scheduler-k8s-master02 1/1 Running 25 (25h ago) 30d 192.168.40.241 k8s-master02 <none> <none>
kube-system kube-scheduler-k8s-master03 1/1 Running 28 (32h ago) 30d 192.168.40.242 k8s-master03 <none> <none>
kube-system kuboard-cc79974cd-t9jth 0/1 Unknown 3 19d <none> k8s-master01 <none> <none>
kube-system metrics-server-5fdfc8fc4b-8bdgq 0/1 ContainerCreating 0 8h <none> k8s-master01 <none> <none>
kube-system metrics-server-7fb6684448-7jhnd 0/1 Unknown 0 8d <none> node02 <none> <none>
kubernetes-dashboard dashboard-metrics-scraper-7bc864c59-cff7g 0/1 Unknown 2 30d <none> node02 <none> <none>
kubernetes-dashboard kubernetes-dashboard-7d545d54dc-bkz2x 0/1 Unknown 1763 30d <none> node01 <none> <none>
kuboard metrics-scraper-7f4896c5d7-6w6ld 0/1 Unknown 3 18d <none> k8s-master01 <none> <none>
kuboard metrics-scraper-7f4896c5d7-k8bs5 0/1 ContainerCreating 0 68m <none> node02 <none> <none>
[root@k8s-master01 ~]#
kubelet 日志反复报 failed to "KillPodSandbox" 等错误
（屏显有限，日志未复制完整）
[root@k8s-master01 k8s-ha-install]# journalctl -xeu kubelet
2月 17 18:18:13 k8s-master01 kubelet[4326]: I0217 18:18:13.875358 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:18:13 k8s-master01 kubelet[4326]: E0217 18:18:13.876313 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:18:20 k8s-master01 kubelet[4326]: E0217 18:18:20.991222 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:18:20 k8s-master01 kubelet[4326]: E0217 18:18:20.991256 4326 kuberuntime_gc.go:177] "Failed to stop sandbox before removing" err="rpc error: code = Unknown desc = fa
2月 17 18:18:22 k8s-master01 kubelet[4326]: E0217 18:18:22.906904 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:18:22 k8s-master01 kubelet[4326]: E0217 18:18:22.906955 4326 kuberuntime_manager.go:965] "Failed to stop sandbox" podSandboxID={Type:containerd ID:b5d4c66be8eaceb760
2月 17 18:18:22 k8s-master01 kubelet[4326]: E0217 18:18:22.906985 4326 kuberuntime_manager.go:705] "killPodWithSyncResult failed" err="failed to \"KillPodSandbox\" for \"36593
2月 17 18:18:22 k8s-master01 kubelet[4326]: E0217 18:18:22.907004 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"KillPodSandbox\" for \"36593808-0a88-
2月 17 18:18:28 k8s-master01 kubelet[4326]: I0217 18:18:28.858450 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:18:28 k8s-master01 kubelet[4326]: E0217 18:18:28.858752 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:18:40 k8s-master01 kubelet[4326]: I0217 18:18:40.858763 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:18:40 k8s-master01 kubelet[4326]: E0217 18:18:40.859120 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:18:51 k8s-master01 kubelet[4326]: E0217 18:18:51.008332 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:18:51 k8s-master01 kubelet[4326]: E0217 18:18:51.008370 4326 kuberuntime_manager.go:965] "Failed to stop sandbox" podSandboxID={Type:containerd ID:de17dabf8ba261e0be
2月 17 18:18:54 k8s-master01 kubelet[4326]: I0217 18:18:54.858380 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:18:54 k8s-master01 kubelet[4326]: E0217 18:18:54.858759 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:19:07 k8s-master01 kubelet[4326]: E0217 18:19:07.884390 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:19:07 k8s-master01 kubelet[4326]: E0217 18:19:07.884443 4326 kuberuntime_manager.go:965] "Failed to stop sandbox" podSandboxID={Type:containerd ID:b5d4c66be8eaceb760
2月 17 18:19:07 k8s-master01 kubelet[4326]: E0217 18:19:07.884477 4326 kuberuntime_manager.go:705] "killPodWithSyncResult failed" err="failed to \"KillPodSandbox\" for \"36593
2月 17 18:19:07 k8s-master01 kubelet[4326]: E0217 18:19:07.884501 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"KillPodSandbox\" for \"36593808-0a88-
2月 17 18:19:09 k8s-master01 kubelet[4326]: I0217 18:19:09.858693 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:19:09 k8s-master01 kubelet[4326]: E0217 18:19:09.859038 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:19:20 k8s-master01 kubelet[4326]: E0217 18:19:20.999021 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:19:20 k8s-master01 kubelet[4326]: E0217 18:19:20.999054 4326 kuberuntime_gc.go:177] "Failed to stop sandbox before removing" err="rpc error: code = Unknown desc = fa
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.047294 4326 remote_runtime.go:224] "RemovePodSandbox from runtime service failed" err="rpc error: code = Unknown des
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.047325 4326 kuberuntime_gc.go:181] "Failed to remove sandbox" err="rpc error: code = Unknown desc = failed to forcib
2月 17 18:19:21 k8s-master01 kubelet[4326]: I0217 18:19:21.052879 4326 image_gc_manager.go:329] "Attempting to delete unused images"
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.057753 4326 remote_runtime.go:205] "StopPodSandbox from runtime service failed" err="rpc error: code = Unknown desc
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.057788 4326 kuberuntime_manager.go:965] "Failed to stop sandbox" podSandboxID={Type:containerd ID:7bee3baf06a62ca728
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.057819 4326 kubelet.go:1899] failed to "KillPodSandbox" for "09170455-4c17-47f3-aabb-6b78ca8b3082" with KillPodSandb
2月 17 18:19:21 k8s-master01 kubelet[4326]: E0217 18:19:21.057835 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"KillPodSandbox\" for \"09170455-4c17-
2月 17 18:19:21 k8s-master01 kubelet[4326]: I0217 18:19:21.068009 4326 eviction_manager.go:356] "Eviction manager: must evict pod(s) to reclaim" resourceName="ephemeral-storag
2月 17 18:19:21 k8s-master01 kubelet[4326]: I0217 18:19:21.068099 4326 eviction_manager.go:374] "Eviction manager: pods ranked for eviction" pods="[kube-system/kuboard-cc79974
2月 17 18:19:24 k8s-master01 kubelet[4326]: I0217 18:19:24.857818 4326 scope.go:115] "RemoveContainer" containerID="5dfacb294adf9b1b99a96029eeeec8001a43bbea2b31132c5225e819aba
2月 17 18:19:24 k8s-master01 kubelet[4326]: E0217 18:19:24.858125 4326 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\"
2月 17 18:19:31 k8s-master01 kubelet[4326]: E0217 18:19:31.068990 4326 eviction_manager.go:593] "Eviction manager: pod failed to evict" err="timeout waiting to kill pod" pod="
2月 17 18:19:31 k8s-master01 kubelet[4326]: I0217 18:19:31.069037 4326 eviction_manager.go:204] "Eviction manager: pods evicted, waiting for pod to be cleaned up" pods="[kube-
[root@k8s-master01 k8s-ha-install]#
查看 Pod 详情（kubectl describe po）时报错：
socket: dial unix /var/run/bird/bird.ctl: connect: no such file or directory
# kubectl describe po coredns-567c556887-2k5gm -n kube-system
......
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Pulled 26m kubelet Container image "docker.io/calico/cni:v3.24.5" already present on machine
Normal Created 26m kubelet Created container upgrade-ipam
Normal Started 26m kubelet Started container upgrade-ipam
Normal Scheduled 26m default-scheduler Successfully assigned kube-system/calico-node-dbqhp to k8s-master01
Normal Pulled 25m (x2 over 26m) kubelet Container image "docker.io/calico/cni:v3.24.5" already present on machine
Normal Created 25m (x2 over 26m) kubelet Created container install-cni
Normal Started 25m (x2 over 26m) kubelet Started container install-cni
Normal Pulling 25m kubelet Pulling image "docker.io/calico/node:v3.24.5"
Normal Pulled 25m kubelet Successfully pulled image "docker.io/calico/node:v3.24.5" in 14.3617197s (14.3617265s including waiting)
Normal Created 25m kubelet Created container mount-bpffs
Normal Started 25m kubelet Started container mount-bpffs
Normal Started 25m kubelet Started container calico-node
Warning Unhealthy 25m (x2 over 25m) kubelet Liveness probe failed: calico/node is not ready: Felix is not live: Get "http://localhost:9099/liveness": dial tcp [::1]:9099: connect: connection refused
Normal Pulled 25m (x2 over 25m) kubelet Container image "docker.io/calico/node:v3.24.5" already present on machine
Normal Created 25m (x2 over 25m) kubelet Created container calico-node
Warning Unhealthy 6m25s (x52 over 25m) kubelet Readiness probe failed: calico/node is not ready: BIRD is not ready: Error querying BIRD: unable to connect to BIRDv4 socket: dial unix /var/run/bird/bird.ctl: connect: no such file or directory
Warning BackOff 82s (x87 over 24m) kubelet Back-off restarting failed container calico-node in pod calico-node-dbqhp_kube-system(beea9742-7720-4da1-b18a-565035bb57c2)
排查了一下午，
发现异常节点的 tunl0 接口不能正常分配 IP：
[root@k8s-master01 k8s-ha-install]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:15:5d:28:e9:04 brd ff:ff:ff:ff:ff:ff
inet 172.19.217.30/16 brd 172.19.255.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
inet6 fe80::b772:4175:35fb:aa7c/64 scope link tentative noprefixroute dadfailed
valid_lft forever preferred_lft forever
inet6 fe80::af84:c4a:4123:f4d0/64 scope link tentative noprefixroute dadfailed
valid_lft forever preferred_lft forever
inet6 fe80::8d7f:abd9:eb13:158d/64 scope link noprefixroute
valid_lft forever preferred_lft forever
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 00:15:5d:28:e9:05 brd ff:ff:ff:ff:ff:ff
inet 192.168.40.240/24 brd 192.168.40.255 scope global noprefixroute dynamic eth1
valid_lft 34sec preferred_lft 34sec
inet 192.168.40.254/24 scope global secondary eth1
valid_lft forever preferred_lft forever
inet6 fe80::bfd2:a9ca:51b5:6112/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
5: virbr0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default qlen 1000
link/ether 52:54:00:c5:ba:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.122.1/24 brd 192.168.122.255 scope global virbr0
valid_lft forever preferred_lft forever
6: virbr0-nic: <BROADCAST,MULTICAST> mtu 1500 qdisc pfifo_fast master virbr0 state DOWN group default qlen 1000
link/ether 52:54:00:c5:ba:c6 brd ff:ff:ff:ff:ff:ff
7: docker0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
link/ether 02:42:57:8f:40:da brd ff:ff:ff:ff:ff:ff
inet 172.17.0.1/16 brd 172.17.255.255 scope global docker0
valid_lft forever preferred_lft forever
inet6 fe80::42:57ff:fe8f:40da/64 scope link
valid_lft forever preferred_lft forever
9: vetha2e0742@if8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master docker0 state UP group default
link/ether 72:c7:77:02:20:02 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet6 fe80::70c7:77ff:fe02:2002/64 scope link
valid_lft forever preferred_lft forever
正常情况下，Calico（IPIP 模式）会在 tunl0 上配置一个隧道 IP，供 Pod 跨节点通信使用，例如：
[root@node01 ~]# ip a show tunl0
4: tunl0@NONE: <NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
inet 172.29.55.1/32 scope global tunl0
valid_lft forever preferred_lft forever
[root@node01 ~]#
kubectl delete -f calico.yaml
kubectl apply -f calico.yaml
反复删除并重建 Calico，并在 calico-node 的环境变量中添加字段指定网卡：
# Cluster type to identify the deployment type
- name: CLUSTER_TYPE
value: "k8s,bgp"
- name: IP_AUTODETECTION_METHOD ##添加字段
value: "interface=eth1" ##添加字段指定网卡
# Auto-detect the BGP IP address.
- name: IP
value: "autodetect"
# Enable IPIP
- name: CALICO_IPV4POOL_IPIP
value: "Always"
问题依旧。
回想是否是 eth0 的 IP 发生了变动：
检查发现 eth0 被 DHCP 分配到了其它网段，异常节点的 eth0 不再与集群处于同一子网。
虽然集群地址配置在 eth1 上，但首次部署 Calico 时未指定网卡（默认 first-found 策略选中了 eth0），
导致后续即使指定网卡再重载也无法恢复。
将 eth0 配置为同一子网的固定 IP 后，重载 Calico，tunl0 成功分配到 IP，问题修复。
命令形式设置
kubectl set env ds/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=bond*,eth1
kubectl logs calico-node-4ltt6 -n kube-system #查看生效接口日志
Calico 未配置 IP 自动检测策略（IP_AUTODETECTION_METHOD）时，默认为 first-found，即选取第一个检测到的有效网卡地址。
https://blog.csdn.net/u013149714/article/details/127763279
https://docs.tigera.io/calico/3.25/reference/configure-calico-node#configuring-the-default-ip-pools
更多推荐
已为社区贡献1条内容
所有评论(0)