K8S 笔记(启用gpu支持)——筑梦之路
k8s 搭建安装2021-4-13https://blog.csdn.net/fulin9452/article/details/104981558?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~aggregatepage~first_rank_v2~rank_aggregation-11-104981558.pc_
k8s 搭建安装
2021-4-13
https://blog.csdn.net/fulin9452/article/details/104981558?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~aggregatepage~first_rank_v2~rank_aggregation-11-104981558.pc_agg_rank_aggregation&utm_term=gpu+%E4%B8%AA%E4%BA%BA+%E6%90%AD%E5%BB%BA&spm=1000.2123.3001.4430
显卡 gtx 1660
操作系统:ubuntu server 16.04
docker版本 19.03
cuda 10.2
cudnn 8.0
显卡驱动:440.100
#禁用默认的显卡驱动:
vim /etc/modprobe.d/blacklist-nouveau.conf
blacklist nouveau
blacklist lbm-nouveau
options nouveau modeset=0
alias nouveau off
alias lbm-nouveau off
sudo update-initramfs -u
reboot #重启
lsmod |grep nouveau
cp cuda/include/cudnn.h /usr/local/cuda/include/
cp cuda/lib64/libcudnn* /usr/local/cuda/lib64/
chmod a+r /usr/local/cuda/include/cudnn.h
chmod a+r /usr/local/cuda/lib64/libcudnn*
安装docker-ce 阿里源
apt-get update
apt-get -y install apt-transport-https ca-certificates curl software-properties-common
curl -fsSL http://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository "deb [arch=amd64] http://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable"
apt-get -y update
apt-get -y install docker-ce
将当前非root用户加入docker组:
sudo groupadd docker #添加docker用户组
sudo gpasswd -a $USER docker #将登陆用户加入到docker用户组中
newgrp docker #更新用户组
docker ps #测试docker命令是否可以不使用sudo正常使用
安装指定版本的docker-ce:
apt-cache madison docker-ce
apt-get install docker-ce=18.03.0~ce-0~ubuntu
vim /lib/systemd/system/docker.service
ExecStart=/usr/bin/dockerd -H fd:// --default-runtime=nvidia
或者编辑 /etc/docker/daemon.json(与上面的 --default-runtime 启动参数二选一,同一选项同时出现在启动参数和 daemon.json 中会导致 dockerd 无法启动):
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
k8s安装
curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add -
cat /etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
apt-get update
apt-get install -y kubelet kubeadm kubectl
禁用swap
swapoff -a
free -h
查看版本
kubelet --version
下载所需的镜像
kubeadm config images list --kubernetes-version=v1.21.0
k8s.gcr.io/kube-apiserver:v1.21.0
k8s.gcr.io/kube-controller-manager:v1.21.0
k8s.gcr.io/kube-scheduler:v1.21.0
k8s.gcr.io/kube-proxy:v1.21.0
k8s.gcr.io/pause:3.4.1
k8s.gcr.io/etcd:3.4.13-0
k8s.gcr.io/coredns/coredns:v1.8.0
脚本拉取:
k8s_get.sh
#!/bin/bash
# k8s_get.sh - pre-pull the kubeadm control-plane images from the Aliyun
# mirror and retag them under the k8s.gcr.io names that
# `kubeadm config images list --kubernetes-version=v1.21.0` reports.
#
# Exit status: 0 if every image was pulled and retagged, 1 otherwise.
set -u

KUBE_VERSION=v1.21.0
KUBE_PAUSE_VERSION=3.4.1
ETCD_VERSION=3.4.13-0
DNS_VERSION=1.8.0
username=registry.cn-hangzhou.aliyuncs.com/google_containers

# Each entry is "<name on the mirror>=<name kubeadm expects under k8s.gcr.io>".
# kubeadm v1.21 looks for un-suffixed component names (no "-amd64") and for
# CoreDNS under the nested path coredns/coredns with a "v"-prefixed tag.
images=(
  "kube-proxy:${KUBE_VERSION}=kube-proxy:${KUBE_VERSION}"
  "kube-scheduler:${KUBE_VERSION}=kube-scheduler:${KUBE_VERSION}"
  "kube-controller-manager:${KUBE_VERSION}=kube-controller-manager:${KUBE_VERSION}"
  "kube-apiserver:${KUBE_VERSION}=kube-apiserver:${KUBE_VERSION}"
  "pause:${KUBE_PAUSE_VERSION}=pause:${KUBE_PAUSE_VERSION}"
  "etcd:${ETCD_VERSION}=etcd:${ETCD_VERSION}"
  "coredns:${DNS_VERSION}=coredns/coredns:v${DNS_VERSION}"
)

failed=0
for entry in "${images[@]}"; do
  src="${username}/${entry%%=*}"
  dst="k8s.gcr.io/${entry#*=}"

  # Keep going when one image fails so the rest are still fetched; the
  # failures are reported and reflected in the exit status.
  if docker pull "${src}" && docker tag "${src}" "${dst}"; then
    #docker tag "${src}" "gcr.io/google_containers/${entry#*=}"
    # Drop the mirror-named tag; the image stays available as ${dst}.
    docker rmi "${src}" || printf 'warning: could not remove %s\n' "${src}" >&2
  else
    printf 'error: failed to fetch %s as %s\n' "${src}" "${dst}" >&2
    failed=$((failed + 1))
  fi
done

if [ "${failed}" -gt 0 ]; then
  printf '%d image(s) need manual correction\n' "${failed}" >&2
  exit 1
fi
有问题的镜像需要手动修正
https://blog.csdn.net/networken/article/details/84571373
初始化k8s:
kubeadm init --apiserver-advertise-address=192.168.30.18 --pod-network-cidr=10.244.0.0/16 --kubernetes-version=v1.21.0
kubeadm join 192.168.30.18:6443 --token l0xhsc.9yczj08noozkaq1w \
--discovery-token-ca-cert-hash sha256:68a4dd69261c79f9885e3c31650580967346bd291bc15c24909a50c46bce1bfc
##########################
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
Alternatively, if you are the root user, you can run:
export KUBECONFIG=/etc/kubernetes/admin.conf
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
https://kubernetes.io/docs/concepts/cluster-administration/addons/
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join 192.168.30.18:6443 --token kf2mr2.s2wz5q1vyr4nyom0 \
--discovery-token-ca-cert-hash sha256:b78d180c56b7ace538f57f66099424c8f05a70da087d75b6f13059f87840db7c
############################
export KUBECONFIG=/etc/kubernetes/admin.conf
kubectl get nodes
创建flannel网络
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
查看pod情况:
kubectl get nodes
kubectl get pods --all-namespaces
kubectl get pods -n kube-system
kubectl describe node
##################################################
通过yaml文件安装dashboard
wget https://raw.githubusercontent.com/kubernetes/dashboard/master/aio/deploy/recommended/kubernetes-dashboard.yaml
kubectl create -f kubernetes-dashboard.yaml
kubectl apply -f http://mirror.faasx.com/kubernetes/dashboard/master/src/deploy/recommended/kubernetes-dashboard.yaml
kubectl --namespace=kube-system get deployment kubernetes-dashboard
kubectl --namespace=kube-system get service kubernetes-dashboard
kubectl proxy --address='0.0.0.0' --accept-hosts='^*$'
##
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.1.0/aio/deploy/recommended.yaml
kubectl get pods -n kubernetes-dashboard
kubectl get svc -n kubernetes-dashboard -o wide
kubectl get pods -n kube-system
kubectl get svc,pod -n kube-system
#修改service type类型变成NodePort, 把type: ClusterIP变成 type: NodePort,保存退出即可
kubectl edit svc kubernetes-dashboard -n kubernetes-dashboard
#通过yaml文件里指定的默认的token登陆dashboard
kubectl get secret -n kubernetes-dashboard
找到对应的带有token的secret,例如kubernetes-dashboard-token-nv47b(后缀为随机生成,以实际输出为准)
kubectl describe secret kubernetes-dashboard-token-nv47b -n kubernetes-dashboard
#创建服务账户
vim admin-user.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: admin-user
namespace: kube-system
kubectl create -f admin-user.yaml
#绑定角色
vim admin-user-role-binding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: admin-user
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-admin
subjects:
- kind: ServiceAccount
name: admin-user
namespace: kube-system
kubectl create -f admin-user-role-binding.yaml
#获取token
kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-user | awk '{print $1}')
这里使用token:kf2mr2.s2wz5q1vyr4nyom0
https://192.168.30.18:31331
#################
https://www.cnblogs.com/double-dong/p/11483670.html
######################
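使用账号密码方式(注意:kube-apiserver 的 --basic-auth-file 参数已在 Kubernetes v1.19 中移除,本文使用的 v1.21.0 不支持,此方式仅适用于 v1.18 及更早版本)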
echo "admin,admin,1" > /etc/kubernetes/pki/basic_auth_file
https://www.cnblogs.com/wenyang321/p/14149099.html
- --basic-auth-file=/etc/kubernetes/pki/basic_auth_file
# kubeadm names the API server static pod kube-apiserver-<node>, so match "apiserver".
kubectl get pod -n kube-system | grep apiserver
kubectl create clusterrolebinding login-on-dashboard-with-cluster-admin --clusterrole=cluster-admin --user=admin
kubectl get clusterrolebinding login-on-dashboard-with-cluster-admin
####################################################
wget https://k8s-1252147235.cos.ap-chengdu.myqcloud.com/dashboard/dashboard.yaml
kubectl get pod -n kube-system
kubectl get pod,svc -n kube-system
https://192.168.30.18:30001
绑定角色:
# Create the dashboard-admin service account in kube-system.
kubectl create serviceaccount dashboard-admin -n kube-system
# Grant it cluster-admin; the subject is <namespace>:<name> and must name the
# account created above, otherwise the token carries no permissions.
kubectl create clusterrolebinding dashboard-admin --clusterrole=cluster-admin --serviceaccount=kube-system:dashboard-admin
# Show the token secret(s) belonging to dashboard-admin.
kubectl describe secrets -n kube-system $(kubectl -n kube-system get secret | awk '/dashboard-admin/{print $1}')
启用GPU支持:
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.9.0/nvidia-device-plugin.yml
####报错信息
Unauthorized (401): Invalid credentials provided
openssl pkcs12 -export -out admin.pfx -inkey k8s_apiserver_admin-key.pem \
-in k8s_apiserver_admin.pem -certfile k8s-ca.pem
#admin.pfx 生成的客户端证书
#k8s_apiserver_admin-key.pem apiserver服务使用的秘钥
#k8s_apiserver_admin.pem apiserver服务使用的证书
#k8s-ca.pem kubernetes集群使用的CA证书
##api 6443 报错信息 403
## http://blog.leanote.com/post/criss/K8S%E4%B9%8BAPI%E8%AE%BF%E9%97%AE
grep 'client-certificate-data' /etc/kubernetes/admin.conf | head -n 1 | awk '{print $2}' | base64 -d >> kubecfg.crt
生成client-certificate-data(与上一条命令二选一即可;两条命令都使用 >> 追加写入 kubecfg.crt,重复执行会写入重复的证书)
grep 'client-certificate-data' ~/.kube/config | head -n 1 | awk '{print $2}' | base64 -d >> kubecfg.crt
# 生成client-key-data
grep 'client-key-data' /etc/kubernetes/admin.conf | head -n 1 | awk '{print $2}' | base64 -d >> kubecfg.key
# 生成p12
openssl pkcs12 -export -clcerts -inkey kubecfg.key -in kubecfg.crt -out kubecfg.p12 -name "kubernetes-client"
#解除限制
kubectl taint node k8s-master node-role.kubernetes.io/master-
或者
docker pull siriuszg/kubernetes-dashboard-amd64:v1.10.1
docker tag siriuszg/kubernetes-dashboard-amd64:v1.10.1 k8s.gcr.io/kubernetes-dashboard:v1.10.1
部署
kubectl create -f kubernetes-dashboard.yaml
kubectl get pods -n kube-system #检查状态
#######################################
编写应用yaml
#######################################
K8S GPU共享
https://blog.csdn.net/yunqiinsight/article/details/87694685
https://github.com/AliyunContainerService/gpushare-scheduler-extender/blob/master/docs/userguide.md
更多推荐
所有评论(0)