k8s集群
Kubernetes 1.24版本后默认使用containerd作为底层容器运行时,需要使用cri-dockerd做中间层来与docker通信。本文记录基于kubeadm在CentOS 7.9上搭建k8s v1.28多master集群的完整过程,并部署Dashboard、Prometheus + Grafana监控以及NFS共享存储。
环境
centos:7.9
k8s-v1.28
docker-v26.1.4
mysql-8.0.21
prometheus
grafana
3台master节点,3台node节点,一台nfs服务器,一台mysql服务器
环境初始化
# 重命名
[root@localhost ~]# hostnamectl set-hostname master1
[root@localhost ~]# su
su
#配置静态IP
[root@master1 ~]# cd /etc/sysconfig/network-scripts/
[root@master1 network-scripts]# vim ifcfg-ens33
BOOTPROTO="none"
NAME="ens33"
DEVICE="ens33"
ONBOOT="yes"
IPADDR=192.168.136.161
PREFIX=24
GATEWAY=192.168.136.2
DNS1=114.114.114.114
DNS2=101.226.4.6
[root@master1 network-scripts]# service network restart
Restarting network (via systemctl): [ 确定 ]
#检查IP地址
[root@master1 network-scripts]# ip add
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: ens33: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:0c:29:2f:4d:2c brd ff:ff:ff:ff:ff:ff
inet 192.168.136.161/24 brd 192.168.136.255 scope global noprefixroute ens33
valid_lft forever preferred_lft forever
inet6 fe80::20c:29ff:fe2f:4d2c/64 scope link
valid_lft forever preferred_lft forever
#查看路由
[root@master1 network-scripts]# ip route
default via 192.168.136.2 dev ens33 proto static metric 100
192.168.136.0/24 dev ens33 proto kernel scope link src 192.168.136.161 metric 100
#查看本地DNS服务器
[root@master1 network-scripts]# cat /etc/resolv.conf
# Generated by NetworkManager
nameserver 114.114.114.114
nameserver 101.226.4.6
配置hosts文件
vim /etc/hosts
cat /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.136.161 master1
192.168.136.162 master2
192.168.136.164 node1
192.168.136.165 node2
192.168.136.166 node3
更新和配置软件源
[root@master1 network-scripts]# cd /etc/yum.repos.d/
[root@master1 yum.repos.d]# rm -f *
[root@master1 yum.repos.d]# curl -O http://mirrors.aliyun.com/repo/Centos-7.repo
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 2523 100 2523 0 0 30244 0 --:--:-- --:--:-- --:--:-- 30768
[root@master1 yum.repos.d]# ls
Centos-7.repo
[root@master1 yum.repos.d]# yum makecache fast
已加载插件:fastestmirror
Determining fastest mirrors
* base: mirrors.aliyun.com
* extras: mirrors.aliyun.com
* updates: mirrors.aliyun.com
关闭firewalld和selinux
[root@master1 yum.repos.d]# systemctl stop firewalld
[root@master1 yum.repos.d]# systemctl disable firewalld
Removed symlink /etc/systemd/system/multi-user.target.wants/firewalld.service.
Removed symlink /etc/systemd/system/dbus-org.fedoraproject.FirewallD1.service.
#临时关闭selinux
[root@master1 yum.repos.d]# setenforce 0
#永久关闭
[root@master1 yum.repos.d]# sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
[root@master1 yum.repos.d]# reboot
[root@master1 ~]# getenforce
Disabled
关闭交换分区
#临时关闭
[root@master1 ~]# swapoff -a
#永久关闭
[root@master1 ~]# vim /etc/fstab
#/dev/mapper/centos-swap swap swap defaults 0 0
调整内核参数
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
# 先加载 br_netfilter 模块,否则 bridge-nf 相关内核参数不存在
modprobe br_netfilter
lsmod | grep br_netfilter
# 注意:sysctl -p 只读取 /etc/sysctl.conf,/etc/sysctl.d/ 下的文件需用 --system 加载
sysctl --system
配置ipvs功能
# 安装ipset和ipvsadm
yum install ipset ipvsadm -y
# 添加需要加载的模块写入脚本文件
cat <<EOF > /etc/sysconfig/modules/ipvs.modules
#!/bin/bash
modprobe -- ip_vs
modprobe -- ip_vs_rr
modprobe -- ip_vs_wrr
modprobe -- ip_vs_sh
modprobe -- nf_conntrack_ipv4
EOF
# 为脚本文件添加执行权限
chmod +x /etc/sysconfig/modules/ipvs.modules
# 执行脚本文件
/bin/bash /etc/sysconfig/modules/ipvs.modules
# 查看对应的模块是否加载成功
lsmod | grep -e ip_vs -e nf_conntrack_ipv4
# 重启
reboot
配置时间同步
# 跟网络时间做同步
ntpdate ntp.cloud.aliyuncs.com
# 添加计划任务
crontab -e
0 */1 * * * /usr/sbin/ntpdate ntp.cloud.aliyuncs.com
# 重启crond服务
service crond restart
配置docker环境
下载安装docker的仓库文件
cd /etc/yum.repos.d/
curl -O https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
ls
Centos-7.repo docker-ce.repo
安装docker环境
yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
#查看docker版本
[root@master1 yum.repos.d]# docker -v
Docker version 26.1.4, build 5650f9b
#启动docker
[root@master1 yum.repos.d]# systemctl start docker
#设置开机自启
[root@master1 yum.repos.d]# systemctl enable docker
Created symlink from /etc/systemd/system/multi-user.target.wants/docker.service to /usr/lib/systemd/system/docker.service.
#查看docker状态
[root@master1 yum.repos.d]# systemctl status docker
配置docker镜像加速
[root@master1 yum.repos.d]# vim /etc/docker/daemon.json
{
"registry-mirrors": ["https://hub.docker-alhk.dkdun.com/"],
"exec-opts": ["native.cgroupdriver=systemd"] #指定cgroup的驱动程序是systemd
}
#重新加载docker的配置文件和重启docker服务
[root@master1 yum.repos.d]# systemctl daemon-reload
[root@master1 yum.repos.d]# systemctl restart docker
配置cri-docker
kubernets 1.24版本后默认使用containerd做底层容器,需要使用cri-dockerd做中间层来与docker通信
mkdir /cri-docker
cd /cri-docker/
# 下载
wget https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.8/cri-dockerd-0.3.8-3.el7.x86_64.rpm
# 安装
rpm -ivh cri-dockerd-0.3.8-3.el7.x86_64.rpm
# 重载系统守护进程
systemctl daemon-reload
# 修改配置文件
vim /usr/lib/systemd/system/cri-docker.service
# 修改第10行 ExecStart
# 改为
ExecStart=/usr/bin/cri-dockerd --pod-infra-container-image=registry.aliyuncs.com/google_containers/pause:3.9 --container-runtime-endpoint fd://
配置cri-docker服务自启动
# 重载系统守护进程
systemctl daemon-reload
# 启动cri-dockerd
systemctl start cri-docker.socket cri-docker
# 设置cri-dockerd自启动
systemctl enable cri-docker.socket cri-docker
# 检查Docker组件状态
systemctl status docker cri-docker.socket cri-docker
配置k8s集群环境
安装kubectl
# 下载
curl -LO "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl"
# 检验 可选
curl -LO "https://dl.k8s.io/release/v1.28.2/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
# 安装
install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
# 测试
kubectl version --client
#Client Version: v1.28.2
#Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
配置k8s组件源
cat <<EOF | tee /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=0
repo_gpgcheck=0
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF
yum makecache
安装
# 安装
yum install -y kubeadm-1.28.2-0 kubelet-1.28.2-0 kubectl-1.28.2-0 --disableexcludes=kubernetes
# 如果报错未找到就试试不指定版本
yum install -y kubeadm kubelet kubectl --disableexcludes=kubernetes
# 设置自启动
systemctl enable --now kubelet
集群初始化
在master节点执行
kubeadm init --kubernetes-version=v1.28.2 \
--pod-network-cidr=10.224.0.0/16 \
--apiserver-advertise-address=192.168.136.161 \
--image-repository=registry.aliyuncs.com/google_containers \
--cri-socket=unix:///var/run/cri-dockerd.sock
#192.168.136.161为master1的IP
swap 未关闭报错
kubelet 未启动报错
之后启动后再执行命令还是报错,端口 6443, 10259, 10257, 10250, 2379, 2380 都被占用了,有写yaml文件已存在, etcd 数据目录不为空
ps aux |grep 6443
kill -9 3058
......
删除已存在的yaml文件
rm /etc/kubernetes/manifests/kube-apiserver.yaml
rm /etc/kubernetes/manifests/kube-controller-manager.yaml
rm /etc/kubernetes/manifests/kube-scheduler.yaml
rm /etc/kubernetes/manifests/etcd.yaml
清空 etcd 数据目录
rm -rf /var/lib/etcd/*
成功后会提示以下信息:
[addons] Applied essential addon: CoreDNS
[addons] Applied essential addon: kube-proxy
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
Alternatively, if you are the root user, you can run:
export KUBECONFIG=/etc/kubernetes/admin.conf
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
https://kubernetes.io/docs/concepts/cluster-administration/addons/
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e
记下系统提示命令kubeadm join xxxxx
,并在后面追加 --cri-socket unix:///var/run/cri-dockerd.sock
完整命令应该类似于:kubeadm join 192.168.136.161:6443 --token xxx --discovery-token-ca-cert-hash sha256:xxx --cri-socket unix:///var/run/cri-dockerd.sock
mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config
查找token和certificate key
[root@master1 .kube]# kubeadm init phase upload-certs --upload-certs
I0816 16:16:42.129855 8453 version.go:256] remote version is much newer: v1.31.0; falling back to: stable-1.28
[upload-certs] Storing the certificates in Secret "kubeadm-certs" in the "kube-system" Namespace
[upload-certs] Using certificate key:
b5ecc6f24110ac0e31a1026a74fe95ad4e2fdab1c6dba919b7f651c9d7dd265f
[root@master1 .kube]# kubeadm token create --print-join-command
kubeadm join 192.168.136.161:6443 --token btyzsw.3o1zswwdm0v904pr --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e
node节点加入集群
# 上面得到的命令
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
--cri-socket unix:///var/run/cri-dockerd.sock
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...
This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.
Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
master节点加入集群
#参考文档
#https://kubernetes.io/zh-cn/docs/reference/setup-tools/kubeadm/kubeadm-join/
#在node节点加入集群命令中添加参数 --control-plane
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
--discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
--control-plane \
--cri-socket unix:///var/run/cri-dockerd.sock
出现下面的错误,集群当前没有配置稳定的 controlPlaneEndpoint
地址
这通常是在集群初始化时通过 kubeadm init
命令的 --control-plane-endpoint
参数来设置的。
[root@master2 ~]# kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu \
> --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e \
> --control-plane \
> --cri-socket unix:///var/run/cri-dockerd.sock
[preflight] Running pre-flight checks
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -o yaml'
error execution phase preflight:
One or more conditions for hosting a new control plane instance is not satisfied.
unable to add a new control plane instance to a cluster that doesn't have a stable controlPlaneEndpoint address
Please ensure that:
* The cluster has a stable controlPlaneEndpoint address.
* The certificates that must be shared among control plane instances are provided.
To see the stack trace of this error execute with --v=5 or higher
解决方法
[root@master1 ~]# kubectl get cm kubeadm-config -n kube-system
NAME DATA AGE
kubeadm-config 1 33m
[root@master1 ~]# kubectl describe cm kubeadm-config -n kube-system
Name: kubeadm-config
Namespace: kube-system
Labels: <none>
Annotations: <none>
Data
====
ClusterConfiguration:
----
apiServer:
extraArgs:
authorization-mode: Node,RBAC
timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns: {}
etcd:
local:
dataDir: /var/lib/etcd
imageRepository: registry.aliyuncs.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: v1.28.2
networking:
dnsDomain: cluster.local
podSubnet: 10.224.0.0/16
serviceSubnet: 10.96.0.0/12
scheduler: {}
BinaryData
====
Events: <none>
[root@master1 ~]# kubectl edit cm kubeadm-config -n kube-system
configmap/kubeadm-config edited
master2执行命令加入节点
kubeadm join 192.168.136.161:6443 --token rokdqu.7bphgp43lhf0tteu --discovery-token-ca-cert-hash sha256:c0b81b87cd45cd030bf7674b3d25a2d06dbd20ea78817f2461e9ddcbcf1c7f2e --control-plane --certificate-key b5ecc6f24110ac0e31a1026a74fe95ad4e2fdab1c6dba919b7f651c9d7dd265f --cri-socket unix:///var/run/cri-dockerd.sock
完成后查看节点
[root@master1 ~]# kubectl get node
NAME STATUS ROLES AGE VERSION
master1 NotReady control-plane 119m v1.28.2
master2 NotReady control-plane 7m49s v1.28.2
node1 NotReady <none> 107m v1.28.2
node2 NotReady <none> 107m v1.28.2
node3 NotReady <none> 107m v1.28.2
分配worker
# 在master上执行
kubectl label node node1 node-role.kubernetes.io/worker=worker
kubectl label node node2 node-role.kubernetes.io/worker=worker
kubectl label node node3 node-role.kubernetes.io/worker=worker
安装Calico网络插件
# master执行
wget https://docs.projectcalico.org/manifests/calico.yaml
kubectl apply -f calico.yaml
# 验证
kubectl get nodes
NAME STATUS ROLES AGE VERSION
master1 Ready control-plane 126m v1.28.2
master2 Ready control-plane 14m v1.28.2
node1 Ready worker 114m v1.28.2
node2 Ready worker 114m v1.28.2
node3 Ready worker 114m v1.28.2
#查看pod
root@master1 ~]# kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-658d97c59c-prwkv 1/1 Running 0 5m25s
kube-system calico-node-2kdfk 1/1 Running 0 5m25s
kube-system calico-node-47hcn 1/1 Running 0 5m25s
kube-system calico-node-4pc5c 1/1 Running 0 5m25s
kube-system calico-node-nsqfv 1/1 Running 0 5m25s
kube-system calico-node-vltbx 1/1 Running 0 5m25s
kube-system coredns-66f779496c-k2hf8 1/1 Running 0 127m
kube-system coredns-66f779496c-sr9rc 1/1 Running 0 127m
kube-system etcd-master1 1/1 Running 1 (129m ago) 127m
kube-system etcd-master2 1/1 Running 0 15m
kube-system kube-apiserver-master1 1/1 Running 1 (130m ago) 127m
kube-system kube-apiserver-master2 1/1 Running 0 15m
kube-system kube-controller-manager-master1 1/1 Running 2 (15m ago) 127m
kube-system kube-controller-manager-master2 1/1 Running 0 15m
kube-system kube-proxy-7w9qw 1/1 Running 0 15m
kube-system kube-proxy-8bb5g 1/1 Running 0 115m
kube-system kube-proxy-b8r8z 1/1 Running 0 115m
kube-system kube-proxy-cbhx4 1/1 Running 0 127m
kube-system kube-proxy-dg65j 1/1 Running 0 115m
kube-system kube-scheduler-master1 1/1 Running 2 (15m ago) 127m
kube-system kube-scheduler-master2 1/1 Running 0 15m
安装Dashboard
以下命令均只在master节点上执行
下载安装
wget https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
vim recommended.yaml
修改Service部分,改为NodePort对外暴露端口
The range of valid ports is 30000-32767
39 spec:
40 type: NodePort
41 ports:
42 - port: 443
43 targetPort: 8443
44 nodePort: 30081
45 selector:
46 k8s-app: kubernetes-dashboard
安装
kubectl apply -f recommended.yaml
查看
[root@master1 ~]# kubectl get pods,svc -n kubernetes-dashboard
NAME READY STATUS RESTARTS AGE
pod/dashboard-metrics-scraper-5657497c4c-zlfz6 1/1 Running 0 88s
pod/kubernetes-dashboard-78f87ddfc-9zrss 1/1 Running 0 88s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 88s
service/kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 12s
创建账号
创建dashboard-access-token.yaml文件
# Creating a Service Account
apiVersion: v1
kind: ServiceAccount
metadata:
name: admin-user
namespace: kubernetes-dashboard
---
# Creating a ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: admin-user
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: ServiceAccount
name: admin-user
namespace: kubernetes-dashboard
---
# Getting a long-lived Bearer Token for ServiceAccount
apiVersion: v1
kind: Secret
metadata:
name: admin-user
namespace: kubernetes-dashboard
annotations:
kubernetes.io/service-account.name: "admin-user"
type: kubernetes.io/service-account-token
# Clean up and next steps
# kubectl -n kubernetes-dashboard delete serviceaccount admin-user
# kubectl -n kubernetes-dashboard delete clusterrolebinding admin-user
执行
kubectl apply -f dashboard-access-token.yaml
#获取token
kubectl get secret admin-user -n kubernetes-dashboard -o jsonpath='{.data.token}' | base64 -d
访问dashboard
[root@master1 ~]# kubectl get secret -n kubernetes-dashboard
NAME TYPE DATA AGE
admin-user kubernetes.io/service-account-token 3 49s
kubernetes-dashboard-certs Opaque 0 4m58s
kubernetes-dashboard-csrf Opaque 1 4m58s
kubernetes-dashboard-key-holder Opaque 2 4m58s
[root@master1 ~]# kubectl get svc -n kubernetes-dashboard
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 5m20s
kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 4m4s
浏览器访问集群ip:端口(https://192.168.136.161:30081/),注意https
输入上一步获取到的token即可
解决token默认15分钟过期的问题
[root@k8s-master-1 ~]# vim recommended.yaml
193 containers:
194 - name: kubernetes-dashboard
195 image: kubernetesui/dashboard:v2.7.0
196 imagePullPolicy: Always
197 ports:
198 - containerPort: 8443
199 protocol: TCP
200 args:
201 - --auto-generate-certificates
202 - --namespace=kubernetes-dashboard
203 - --token-ttl=43200 #添加这条配置,超时时间调整为12小时
重新应用
[root@k8s-master-1 ~]# kubectl apply -f recommended.yaml
安装kubectl命令自动补全
yum install bash-completion -y
# 临时设置自动补全
source <(kubectl completion bash)
# 永久设置自动补全
echo "source <(kubectl completion bash)" >> ~/.bashrc && bash
部署metric-server
下载
master执行
wget https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.6.2/components.yaml
修改
vim components.yaml
修改140行左右
原:
containers:
- args:
...
image: k8s.gcr.io/metrics-server/metrics-server:v0.6.2
修改后:
containers:
- args:
...
- --kubelet-insecure-tls # 添加这一行
image: admin4j/metrics-server:v0.6.2 # 修改镜像仓库地址
应用
kubectl apply -f components.yaml
查看
[root@master1 ~]# kubectl top nodes
NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
master1 180m 9% 1143Mi 66%
master2 147m 7% 1050Mi 61%
node1 78m 3% 866Mi 50%
node2 79m 3% 894Mi 52%
node3 81m 4% 914Mi 53%
目前问题,master1和master2都需要启动,否则会报错
#master1上报错
[root@master1 ~]# kubectl get nodes
Error from server: etcdserver: request timed out
#master2上报错
root@master2 ~]# kubectl get pod -o wide -A
Unable to connect to the server: dial tcp 192.168.136.161:6443: i/o timeout
Prometheus监控k8s
监控方案
Cadvisor + node-exporter + prometheus + grafana
- Cadvisor:数据采集
- node-exporter:汇总
- prometheus:处理、存储
- grafana:展示
监控流程
- 容器监控:Prometheus使用cadvisor采集容器监控指标,而cadvisor集成在K8S的kubelet中所以无需部署,通过Prometheus进程存储,使用grafana进行展示。
- node节点监控:node端的监控通过node_exporter采集当前主机的资源,通过Prometheus进程存储,最后使用grafana进行展示。
- master节点监控:master的监控通过kube-state-metrics插件从K8S获取到apiserver的相关数据并通过网页页面暴露出来,然后通过Prometheus进程存储,最后使用grafana进行展示
#上传yaml文件
[root@master1 prometheus-k8s]# ls
configmap.yaml csdn—prometheus监控k8s.txt grafana-deploy.yaml grafana-ing.yaml grafana-svc.yaml node-exporter.yaml prometheus.deploy.yml prometheus.svc.yml rbac-setup.yaml
#修改prometheus.deploy.yml,镜像版本修改为 - image: prom/prometheus
#node-exporter.yaml,prometheus.deploy.yml,grafana-deploy.yaml
#提前在节点拉取镜像
docker pull prom/node-exporter
docker pull prom/prometheus
docker pull grafana/grafana:6.1.4
采用daemonset方式部署node-exporter
[root@master1 prometheus-k8s]# kubectl apply -f node-exporter.yaml
daemonset.apps/node-exporter created
service/node-exporter created
[root@master1 prometheus-k8s]# kubectl get pods -A -o wide |grep exporter
kube-system node-exporter-5kfl9 1/1 Running 0 60s 10.224.135.7 node3 <none> <none>
kube-system node-exporter-ckg9f 1/1 Running 0 60s 10.224.104.3 node2 <none> <none>
kube-system node-exporter-s2frk 1/1 Running 0 60s 10.224.166.131 node1 <none> <none>
[root@master1 prometheus-k8s]# kubectl get daemonset -A
NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
kube-system calico-node 5 5 5 5 5 kubernetes.io/os=linux 18h
kube-system kube-proxy 5 5 5 5 5 kubernetes.io/os=linux 20h
kube-system node-exporter 3 3 3 3 3 <none> 2m41s
[root@master1 prometheus-k8s]# kubectl get svc -A
NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
default kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 20h
kube-system kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 20h
kube-system metrics-server ClusterIP 10.103.190.182 <none> 443/TCP 17h
kube-system node-exporter NodePort 10.99.110.5 <none> 9100:31672/TCP 2m48s
kubernetes-dashboard dashboard-metrics-scraper ClusterIP 10.99.245.129 <none> 8000/TCP 18h
kubernetes-dashboard kubernetes-dashboard NodePort 10.98.7.173 <none> 443:30081/TCP 18h
部署Prometheus
[root@master1 prometheus-k8s]# kubectl apply -f rbac-setup.yaml
clusterrole.rbac.authorization.k8s.io/prometheus created
serviceaccount/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
[root@master1 prometheus-k8s]# kubectl apply -f configmap.yaml
configmap/prometheus-config created
[root@master1 prometheus-k8s]# kubectl apply -f prometheus.deploy.yml
deployment.apps/prometheus created
[root@master1 prometheus-k8s]# kubectl apply -f prometheus.svc.yml
service/prometheus created
部署grafana
[root@master1 prometheus-k8s]# kubectl apply -f grafana-deploy.yaml
deployment.apps/grafana-core created
[root@master1 prometheus-k8s]# kubectl apply -f grafana-svc.yaml
service/grafana created
[root@master1 prometheus-k8s]# kubectl apply -f grafana-ing.yaml
[root@master1 prometheus-k8s]# cd ..
[root@master1 k8s-prometheus-grafana-master]# kubectl apply -f grafana-ing.yaml
ingress.networking.k8s.io/grafana created
校验测试
查看pod/svc信息
[root@master1 prometheus-k8s]# kubectl get pods -A -o wide
[root@master1 prometheus-k8s]# kubectl get svc -A
查看prometheus
访问http://192.168.136.165:30003
,这是Prometheus的页面,依次点击Status>Targets
可以看到已经成功连接到k8s的apiserver
查看node exporter
访问http://192.168.136.161:31672/metrics
,这是node-exporter采集的数据。
查看grafana
访问http://192.168.136.165:30950
,这是grafana的页面,账户、密码都是admin。
创建Dashboard
add data souce
部署NFS
#配置静态IP:192.168.136.167
vim /etc/sysconfig/network-scripts/ifcfg-ens33
#重命名 k8s-nfs
hostnamectl set-hostname k8s-nfs
su
#关闭防火墙
systemctl stop firewalld
systemctl disable firewalld
在nfs服务器,部署web的node节点上安装nfs-utils
yum install nfs-utils -y
只启动nfs上nfs服务
service nfs start
systemctl enable nfs
设置共享目录
[root@k8s-nfs ~]# vim /etc/exports
[root@k8s-nfs ~]# cat /etc/exports
/web/html 192.168.136.0/24(rw,sync,all_squash)
刷新服务,输出共享目录
[root@k8s-nfs ~]# mkdir -p /web/html
[root@k8s-nfs ~]# exportfs -rv
exporting 192.168.136.0/24:/web/html
[root@k8s-nfs ~]# cd /web/
#设置/web文件夹的权限,允许其他人过来读写
[root@k8s-nfs web]# chown nfsnobody:nfsnobody html/
测试
#挂载
[root@node2 /]# mount 192.168.136.167:/web/html /html
[root@node2 html]# touch a.txt
[root@node2 html]# df -Th|grep nfs
192.168.136.167:/web/html nfs4 50G 1.8G 49G 4% /html
# 查看
[root@k8s-nfs web]# cd html/
[root@k8s-nfs html]# ls
a.txt
上传前端文件
#打包前端文件
#若是用vue框架开发
npm run build
#将打包后的文件上传
[root@k8s-nfs html]# ls
css favicon.ico img index.html js
创建PV
[root@master1 web-yaml]# kubectl apply -f pv-nfs.yaml
persistentvolume/pv-nfs-web created
#pv-nfs.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv-nfs-web
labels:
app: web
environment: production
spec:
capacity:
storage: 50Gi
accessModes:
- ReadWriteMany
storageClassName: "nfs"
nfs:
path: "/web/html"
server: 192.168.136.167
readOnly: false
创建PVC
#pvc-nfs.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: pvc-nfs-web
labels:
app: nginx
environment: production
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 888Mi
storageClassName: "nfs"
[root@master1 web-yaml]# kubectl apply -f pvc-nfs.yaml
persistentvolumeclaim/pvc-nfs-web created
#查看pv
[root@master1 web-yaml]# kubectl get pv
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
pv-nfs-web 50Gi RWX Retain Bound default/pvc-nfs-web nfs 13s
#查看pvc
[root@master1 web-yaml]# kubectl get pvc
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
pvc-nfs-web Bound pv-nfs-web 50Gi RWX nfs 19s
sed -i 's/114.114.114.114/192.168.136.2/g' /etc/resolv.conf
还需要构建镜像,部署ingress,探针
安装jdk17
不用安装也可以
#安装jdk17
# 官网下载压缩包
https://download.oracle.com/java/17/latest/jdk-17_linux-x64_bin.tar.gz
# 解压
tar -xf jdk-17_linux-x64_bin.tar.gz
mv jdk-17.0.12/ /usr/local/jdk17
# 添加以下内容
vim /etc/profile
export JAVA_HOME=/usr/local/jdk17
export CLASSPATH=$JAVA_HOME/lib
export PATH=$JAVA_HOME/bin:$PATH
# 使用以下命令使配置生效
source /etc/profile
[root@node1 ~]# java -version
java version "17.0.12" 2024-07-16 LTS
Java(TM) SE Runtime Environment (build 17.0.12+8-LTS-286)
Java HotSpot(TM) 64-Bit Server VM (build 17.0.12+8-LTS-286, mixed mode, sharing)
安装golang
yum install epel-release -y
yum install golang -y
# 查看版本
[root@node1 ~]# go version
go version go1.20.12 linux/amd64
安装mysql
[root@db ~]# cat onekey_install_mysql8.sh
#!/bin/bash
cd /usr/local
yum install wget -y
wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-8.0.21-linux-glibc2.12-x86_64.tar.xz
tar xvJf mysql-8.0.21-linux-glibc2.12-x86_64.tar.xz
mv mysql-8.0.21-linux-glibc2.12-x86_64 mysql-8.0
cd mysql-8.0
mkdir data
id mysql|| useradd mysql -s /sbin/nologin
chown -R mysql.mysql /usr/local/mysql-8.0
cd bin
./mysqld --user=mysql --basedir=/usr/local/mysql-8.0 --datadir=/usr/local/mysql-8.0/data/ --initialize
cat > /etc/my.cnf <<EOF
[mysqld]
basedir=/usr/local/mysql-8.0/
datadir=/usr/local/mysql-8.0/data/
socket=/tmp/mysql.sock
character-set-server=UTF8MB4
symbolic-links=0
!includedir /etc/my.cnf.d
EOF
cd ..
cp -a ./support-files/mysql.server /etc/init.d/mysql
chmod +x /etc/init.d/mysql
chkconfig --add mysql
service mysql start
ln -s /usr/local/mysql-8.0/bin/mysql /usr/bin
# 创建数据库
CREATE DATABASE savor CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
# 初始化
[root@k8s-db mysql]# mysql -uroot -p savor < savor.sql
Enter password:
# 更改密码(如果 root 用户已经存在)
ALTER USER 'root'@'%' IDENTIFIED BY '123456';
# 创建root 用户用于远程主机上
CREATE USER 'root'@'%' IDENTIFIED BY '123456';
# 授予权限
GRANT ALL PRIVILEGES ON savor.* TO 'root'@'%';
FLUSH PRIVILEGES;
构建二进制文件
go mod init savor_gin
#国内镜像站点,阿里云的 Go 代理
export GOPROXY=https://mirrors.aliyun.com/goproxy/
# 或者腾讯云 Go 代理
export GOPROXY=https://goproxy.cn
# 解决依赖
go mod tidy
#制作二进制文件
[root@node1 go_src]# go build -o savor main.go
[root@node1 go_src]# ls
conf controllers errorType go.mod go.sum logs main.go middlewares models router savor utils
# 测试能否运行
[root@node1 go_src]# ./savor
[GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
[GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
- using env: export GIN_MODE=release
- using code: gin.SetMode(gin.ReleaseMode)
# 浏览器访问
http://192.168.136.164:9000/region/list
构建镜像,这里是java的,(跳过)
# 创建文件夹存放资源
mkdir /java-blog
# 上传打包好的jar包,并且测试是否能连接到数据库
java -jar poetize-server.jar
# 编写Dockerfile
vim Dockerfile
FROM openjdk:17-jdk-slim
WORKDIR /app
COPY poetize-server.jar /app/
ENTRYPOINT ["java","-jar","poetize-server.jar"]
[root@node1 java-blog]# ls
Dockerfile poetize-server.jar
# 提前拉取基础镜像
docker pull openjdk:17-jdk-slim
# 开始构建镜像
[root@node1 java-blog]# docker build -t java-blog:1.0 .
[+] Building 0.6s (8/8) FINISHED
构建镜像,Go
# 创建文件夹存放制作镜像的材料
[root@node1 go_src]# mkdir /DockerFile
[root@node1 go_src]# cd /DockerFile/
[root@node1 DockerFile]# cp /go_src/savor .
[root@node1 DockerFile]# touch Dockerfile
[root@node1 DockerFile]# mkdir conf
[root@node1 DockerFile]# cp /go_src/conf/config.yaml conf/
[root@node1 DockerFile]# ls conf/
config.yaml
[root@node1 DockerFile]# ls
conf Dockerfile savor
[root@node1 DockerFile]# cat Dockerfile
# Use centos:7 as the base image.
FROM centos:7
# Set the working directory inside the container.
WORKDIR /go
# Copy the build context (the savor binary and conf/ directory) into /go.
COPY . /go
# Ensure the binary is executable regardless of host file permissions.
RUN chmod +x /go/savor
# Document the port the service listens on (mapped at run time with -p).
EXPOSE 9000
# Run the Go web service as the container's entry point.
ENTRYPOINT ["/go/savor"]
################
# 制作镜像
docker build -t savor:1.0 .
测试
# 创建容器并启动
docker run -d -p 9000:9000 --name savor-1 savor:1.0
# 查看容器
docker ps |grep savor
57085b4129a5 savor:1.0 "/go/savor" 2 minutes ago Up 2 minutes 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp savor-1
# 查看日志
[root@node1 java-blog]# docker logs 57085b4129a5
将镜像传到其他服务器
#导出镜像
docker save -o savor.tar savor:1.0
#上传至其他Node节点
scp savor.tar 192.168.136.165:/root
scp savor.tar 192.168.136.166:/root
# 在其他node节点导入镜像
docker load -i savor.tar
创建Deployment和Service
[root@master1 ~]# vim deplay-probe-savor.yaml
[root@master1 ~]# cat deplay-probe-savor.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: savor-deploy
spec:
replicas: 3
selector:
matchLabels:
app: savor
template:
metadata:
labels:
app: savor
spec:
containers:
- name: savor-container
imagePullPolicy: IfNotPresent
image: savor:1.0
ports:
- containerPort: 9000
# 添加资源限制和请求
resources:
limits:
cpu: 500m
memory: "512Mi"
requests:
cpu: 200m
memory: "128Mi"
# 添加存活探针
livenessProbe:
httpGet:
path: /region/list
port: 9000
initialDelaySeconds: 30 # 探针开始前等待的时间秒数
periodSeconds: 10 # 探针执行间隔时间秒数
timeoutSeconds: 5 # 探针超时时间秒数
failureThreshold: 3 # 失败次数阈值,达到此值后认为容器不健康
successThreshold: 1 # 成功次数阈值,连续成功此次数认为容器健康
---
apiVersion: v1
kind: Service
metadata:
name: savor-service
spec:
selector:
app: savor # 匹配Deployment中的标签
ports:
- protocol: TCP
port: 9000 # Service暴露的端口
targetPort: 9000 # Pod内部服务监听的端口
nodePort: 30000 # 指定节点端口(可选)
type: NodePort # NodePort类型,使服务在每个节点的指定端口上可用
创建pod,并配置HPA
[root@master1 ~]# kubectl apply -f deplay-probe-savor.yaml
deployment.apps/savor-deploy created
service/savor-service created
[root@master1 ~]# kubectl get pod -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
savor-deploy-74d68b85dd-9d5lk 1/1 Running 0 38s 10.224.135.20 node3 <none> <none>
savor-deploy-74d68b85dd-thjgg 1/1 Running 0 38s 10.224.166.181 node1 <none> <none>
savor-deploy-74d68b85dd-tmpf6 1/1 Running 0 38s 10.224.104.14 node2 <none> <none>
[root@master1 ~]# kubectl get deploy
NAME READY UP-TO-DATE AVAILABLE AGE
savor-deploy 3/3 3 3 66s
[root@master1 ~]# kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 40d
savor-service NodePort 10.96.101.254 <none> 9000:30000/TCP 70s
[root@master1 ~]# kubectl get rs
NAME DESIRED CURRENT READY AGE
savor-deploy-74d68b85dd 3 3 3 2m18s
# 创建HPA,平均CPU使用率超过50%时,自动扩展Pod的数量,最小为3个Pod,最大为6个Pod
[root@master1 ~]# kubectl autoscale deployment savor-deploy --cpu-percent=50 --min=3 --max=6
horizontalpodautoscaler.autoscaling/savor-deploy autoscaled
[root@master1 ~]# kubectl get hpa
NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE
savor-deploy Deployment/savor-deploy 0%/50% 3 6 3 45s
前端界面挂载
#编写yaml
[root@master1 ~]# cat nginx-ingress-web-savor.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: nginx-savor-deployment
spec:
replicas: 3
selector:
matchLabels:
app: nginx
template:
metadata:
labels:
app: nginx
spec:
containers:
- name: nginx-container
imagePullPolicy: IfNotPresent
image: nginx:latest
ports:
- containerPort: 80
volumeMounts:
- name: nginx-volume
mountPath: /usr/share/nginx/html
volumes:
- name: nginx-volume
persistentVolumeClaim:
claimName: pvc-nfs-web
---
apiVersion: v1
kind: Service
metadata:
name: nginx-savor-service
spec:
selector:
app: nginx
ports:
- protocol: TCP
port: 80 # Service暴露的端口
targetPort: 80 # Pod内部服务监听的端口
# nodePort: 30002 # 指定节点端口(可选)
name: http # 给端口起名字,方便在Ingress中引用
type: NodePort # NodePort类型,使服务在每个节点的指定端口上可用
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: nginx-savor-ingress
spec:
ingressClassName: nginx # 使用新的字段来指定Ingress控制器类
rules:
- host: www.savor.com # 替换为你的实际域名
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: nginx-savor-service
port:
name: http # 或者使用 number: 80
[root@master1 ~]# kubectl apply -f nginx-ingress-web-savor.yaml
deployment.apps/nginx-savor-deployment created
service/nginx-savor-service configured
ingress.networking.k8s.io/nginx-savor-ingress created
[root@master1 ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
nginx-savor-deployment-b495bcf58-gf9z4 1/1 Running 0 3m22s
nginx-savor-deployment-b495bcf58-n98qq 1/1 Running 0 3m22s
[root@master1 ~]# kubectl get ingress
NAME CLASS HOSTS ADDRESS PORTS AGE
nginx-savor-ingress nginx www.savor.com 80 3m8s
# 浏览器访问http://192.168.136.161:30002/
更多推荐
所有评论(0)