VMware死机导致etcd数据丢失,造成k8s无法启动

k8s api-server容器不断退出、重启

root@ubuntu:~# docker ps -a |grep kube-apiserver
726f7082259d        0eaa5e1d871a                                                            "kube-apiserver --ad…"   27 seconds ago      Exited (255) 7 seconds ago                                                                                                                                                                                            k8s_kube-apiserver_kube-apiserver-ubuntu_kube-system_19e46c52e07acaf9990e1b09ab162d3c_9
2db76b8b2d63        k8s.gcr.io/pause:3.1                                                    "/pause"                 28 seconds ago      Up 27 seconds                                                                                                                                                                                                         k8s_POD_kube-apiserver-ubuntu_kube-system_19e46c52e07acaf9990e1b09ab162d3c_5
4830897ad463        k8s.gcr.io/pause:3.1                                                    "/pause"                 28 minutes ago      Exited (0) 27 minutes ago                                                                                                                                                                                             k8s_POD_kube-apiserver-ubuntu_kube-system_19e46c52e07acaf9990e1b09ab162d3c_4
root@ubuntu:~# docker logs 726f7082259d
Flag --insecure-port has been deprecated, This flag will be removed in a future version.
I1231 07:42:24.387492       1 server.go:560] external host was not specified, using 192.168.253.129
I1231 07:42:24.387615       1 server.go:147] Version: v1.15.11
I1231 07:42:24.591143       1 plugins.go:158] Loaded 10 mutating admission controller(s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,MutatingAdmissionWebhook.
I1231 07:42:24.591165       1 plugins.go:161] Loaded 6 validating admission controller(s) successfully in the following order: LimitRanger,ServiceAccount,Priority,PersistentVolumeClaimResize,ValidatingAdmissionWebhook,ResourceQuota.
E1231 07:42:24.591500       1 prometheus.go:55] failed to register depth metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591536       1 prometheus.go:68] failed to register adds metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591547       1 prometheus.go:82] failed to register latency metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591555       1 prometheus.go:96] failed to register workDuration metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591566       1 prometheus.go:112] failed to register unfinished metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591574       1 prometheus.go:126] failed to register unfinished metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591581       1 prometheus.go:152] failed to register depth metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591589       1 prometheus.go:164] failed to register adds metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591612       1 prometheus.go:176] failed to register latency metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591645       1 prometheus.go:188] failed to register work_duration metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591657       1 prometheus.go:203] failed to register unfinished_work_seconds metric admission_quota_controller: duplicate metrics collector registration attempted
E1231 07:42:24.591665       1 prometheus.go:216] failed to register longest_running_processor_microseconds metric admission_quota_controller: duplicate metrics collector registration attempted
I1231 07:42:24.591672       1 plugins.go:158] Loaded 10 mutating admission controller(s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,MutatingAdmissionWebhook.
I1231 07:42:24.591675       1 plugins.go:161] Loaded 6 validating admission controller(s) successfully in the following order: LimitRanger,ServiceAccount,Priority,PersistentVolumeClaimResize,ValidatingAdmissionWebhook,ResourceQuota.
I1231 07:42:24.592695       1 client.go:354] parsed scheme: ""
I1231 07:42:24.592714       1 client.go:354] scheme "" not registered, fallback to default scheme
I1231 07:42:24.592742       1 asm_amd64.s:1337] ccResolverWrapper: sending new addresses to cc: [{127.0.0.1:2379 0  <nil>}]
I1231 07:42:24.592819       1 asm_amd64.s:1337] balancerWrapper: got update addr from Notify: [{127.0.0.1:2379 <nil>}]
W1231 07:42:24.593011       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:25.593419       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
I1231 07:42:25.593607       1 client.go:354] parsed scheme: ""
I1231 07:42:25.593625       1 client.go:354] scheme "" not registered, fallback to default scheme
I1231 07:42:25.593646       1 asm_amd64.s:1337] ccResolverWrapper: sending new addresses to cc: [{127.0.0.1:2379 0  <nil>}]
I1231 07:42:25.593667       1 asm_amd64.s:1337] balancerWrapper: got update addr from Notify: [{127.0.0.1:2379 <nil>}]
W1231 07:42:25.593833       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:26.594735       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:27.246055       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:28.337241       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:29.517026       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:30.841745       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:34.271435       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:34.760027       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:41.465814       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W1231 07:42:42.279205       1 clientconn.go:1251] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 0  <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
I1231 07:42:44.594199       1 asm_amd64.s:1337] balancerWrapper: got update addr from Notify: []
F1231 07:42:44.594195       1 storage_decorator.go:57] Unable to create storage backend: config (&{ /registry {[https://127.0.0.1:2379] /etc/kubernetes/pki/apiserver-etcd-client.key /etc/kubernetes/pki/apiserver-etcd-client.crt /etc/kubernetes/pki/etcd/ca.crt} true 0xc0005deab0 apiextensions.k8s.io/v1beta1 <nil> 5m0s 1m0s}), err (dial tcp 127.0.0.1:2379: connect: connection refused)

etcd报错

root@ubuntu:~# docker logs be234e2baa2a
2020-12-31 07:43:57.251982 I | etcdmain: etcd Version: 3.3.10
2020-12-31 07:43:57.252049 I | etcdmain: Git SHA: 27fc7e2
2020-12-31 07:43:57.252052 I | etcdmain: Go Version: go1.10.4
2020-12-31 07:43:57.252053 I | etcdmain: Go OS/Arch: linux/amd64
2020-12-31 07:43:57.252055 I | etcdmain: setting maximum number of CPUs to 4, total number of available CPUs is 4
2020-12-31 07:43:57.252106 N | etcdmain: the server is already initialized as member before, starting as etcd member...
2020-12-31 07:43:57.252124 I | embed: peerTLS: cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, ca = , trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file = 
2020-12-31 07:43:57.252565 I | embed: listening for peers on https://192.168.253.129:2380
2020-12-31 07:43:57.252611 I | embed: listening for client requests on 127.0.0.1:2379
2020-12-31 07:43:57.252623 I | embed: listening for client requests on 192.168.253.129:2379
2020-12-31 07:43:57.254331 I | etcdserver: recovered store from snapshot at index 160016
2020-12-31 07:43:57.255652 C | etcdserver: recovering backend from snapshot error: database snapshot file path error: snap: snapshot file doesn't exist
panic: recovering backend from snapshot error: database snapshot file path error: snap: snapshot file doesn't exist
	panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0xb8cb90]

goroutine 1 [running]:
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdserver.NewServer.func1(0xc4202a7ca0, 0xc4202a7758)
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdserver/server.go:291 +0x40
panic(0xde0ce0, 0xc4201ba510)
	/usr/local/go/src/runtime/panic.go:502 +0x229
github.com/coreos/etcd/cmd/vendor/github.com/coreos/pkg/capnslog.(*PackageLogger).Panicf(0xc4202aa6c0, 0xfe8789, 0x2a, 0xc4202a77f8, 0x1, 0x1)
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/pkg/capnslog/pkg_logger.go:75 +0x162
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdserver.NewServer(0x7ffc842cae7e, 0x6, 0x0, 0x0, 0x0, 0x0, 0xc42028ec00, 0x1, 0x1, 0xc42028ed00, ...)
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdserver/server.go:386 +0x26bb
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/embed.StartEtcd(0xc420276900, 0xc420276d80, 0x0, 0x0)
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/embed/etcd.go:179 +0x811
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain.startEtcd(0xc420276900, 0xfc62b7, 0x6, 0xc4202a8d01, 0x2)
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain/etcd.go:181 +0x40
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain.startEtcdOrProxyV2()
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain/etcd.go:102 +0x1369
github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain.Main()
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/vendor/github.com/coreos/etcd/etcdmain/main.go:46 +0x3f
main.main()
	/tmp/etcd-release-3.3.10/etcd/release/etcd/gopath/src/github.com/coreos/etcd/cmd/etcd/main.go:28 +0x20

暴力解决(仅在没有任何备份、且单节点集群可以接受数据全部丢失时使用)

rm -rf /var/lib/etcd/*

注意:此操作会永久删除 etcd 中的全部集群状态(所有 Deployment、Service、ConfigMap 等都需要重新创建)。如果有备份,应优先使用 `etcdctl snapshot restore` 从快照恢复,而不是清空数据目录。

etcd数据定时备份

backup.sh

#!/bin/sh
# Daily backup of the etcd data directory into a dated, gzip-compressed tarball.
# Usage: backup.sh          (typically invoked from cron; see crontab example)
# Env:   ETCD_DATA_DIR  - directory to back up   (default: /var/lib/etcd)
#        BACKUP_DIR     - where tarballs go      (default: /home/etcd)
# NOTE(review): copying the live data dir is a crude backup; for a consistent
# snapshot of a running etcd, prefer `etcdctl snapshot save`.
set -eu

src_dir="${ETCD_DATA_DIR:-/var/lib/etcd}"
backup_dir="${BACKUP_DIR:-/home/etcd}"
# Date-stamped name, e.g. etcd-bak20201231 (one backup per day; same-day runs overwrite).
name="etcd-bak$(date "+%Y%m%d")"

# Ensure the destination exists; original script failed when /home/etcd was missing.
mkdir -p -- "$backup_dir"

# Original used `tar -cvf` with a .tar.gz name, producing an UNcompressed
# archive with a misleading extension — `-z` actually gzips it.
# `-C` archives the directory by its base name without a persistent `cd`.
tar -czf "$backup_dir/$name.tar.gz" -C "$(dirname -- "$src_dir")" "$(basename -- "$src_dir")"

crontab

# Shell used by cron to run the command field.
SHELL=/bin/bash
# Minimal PATH so the script can find standard tools (cron's default PATH is very short).
PATH=/sbin:/bin:/usr/sbin:/usr/bin
# Run the backup script once a day at 23:59.
# Fields: minute hour day-of-month month day-of-week command
# NOTE(review): the parentheses run the script in a subshell — harmless but unnecessary.
# Consider redirecting output (e.g. `>> /var/log/etcd-backup.log 2>&1`) so cron
# does not try to mail it.
59 23 * * * (/path/to/backup.sh)
Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐