k8s Node Status Management: Source Code Analysis
A node's liveness is represented by two objects: node.status and the Lease object that corresponds to each node. Both are updated by the kubelet: it updates .status when the node's status changes, or when there has been no change for a configured interval. The default interval for .status updates is 5 minutes, which is much longer than the 40-second default timeout for unreachable nodes. The kubelet also creates its Lease object and renews it every 10 seconds (the default renewal interval).
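To look at the two heartbeat objects side by side, here is a minimal client-go sketch (the kubeconfig handling and the node name node-1 are illustrative assumptions, and error handling is reduced to panics for brevity):

package main

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Assumes a reachable cluster and a kubeconfig in the default location.
    config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    clientset := kubernetes.NewForConfigOrDie(config)

    nodeName := "node-1" // hypothetical node name

    // Heartbeat object #1: the Ready condition inside node.status.
    node, err := clientset.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        panic(err)
    }
    for _, cond := range node.Status.Conditions {
        if cond.Type == v1.NodeReady {
            fmt.Printf("Ready=%s lastHeartbeat=%s\n", cond.Status, cond.LastHeartbeatTime)
        }
    }

    // Heartbeat object #2: the Lease in the kube-node-lease namespace,
    // named after the node and renewed by its kubelet every 10s by default.
    lease, err := clientset.CoordinationV1().Leases(v1.NamespaceNodeLease).Get(context.TODO(), nodeName, metav1.GetOptions{})
    if err != nil {
        panic(err)
    }
    fmt.Printf("Lease renewTime=%s\n", lease.Spec.RenewTime)
}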
Source location
pkg/controller/nodelifecycle/node_lifecycle_controller.go
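tryUpdateNodeHealth below is driven by the controller's monitorNodeHealth loop, which Run starts on a timer of nodeMonitorPeriod. Schematically (a simplified rendering, not the verbatim Run method):

// Inside (*Controller).Run, simplified: poll node health every nodeMonitorPeriod.
go wait.UntilWithContext(ctx, func(ctx context.Context) {
    if err := nc.monitorNodeHealth(ctx); err != nil {
        klog.Errorf("Error monitoring node health: %v", err)
    }
}, nc.nodeMonitorPeriod)

The annotated function: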
// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to
// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred.
func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
    /*
        nodeHealthData is defined as follows:
        type nodeHealthData struct {
            // timestamp of the last probe
            probeTimestamp metav1.Time
            // timestamp of the last Ready-condition state transition
            readyTransitionTimestamp metav1.Time
            // the node's status
            status *v1.NodeStatus
            // the Lease object associated with the node
            lease *coordv1.Lease
        }
    */
    nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
    defer func() {
        // After all the checks below, write the health data back for this node.
        nc.nodeHealthMap.set(node.Name, nodeHealth)
    }()
    // Grace period: how long the node may go without posting a status update
    // before it is considered stale.
    var gracePeriod time.Duration
    /*
        type NodeCondition struct {
            // condition type: Ready / MemoryPressure / DiskPressure ...
            Type NodeConditionType `json:"type" protobuf:"bytes,1,opt,name=type,casttype=NodeConditionType"`
            // condition status: one of True, False, Unknown.
            Status ConditionStatus `json:"status" protobuf:"bytes,2,opt,name=status,casttype=ConditionStatus"`
            // time of the last heartbeat
            LastHeartbeatTime metav1.Time `json:"lastHeartbeatTime,omitempty" protobuf:"bytes,3,opt,name=lastHeartbeatTime"`
            // time of the last transition from one status to another
            LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" protobuf:"bytes,4,opt,name=lastTransitionTime"`
            // machine-readable reason for the last transition
            Reason string `json:"reason,omitempty" protobuf:"bytes,5,opt,name=reason"`
            // human-readable message describing the last transition
            Message string `json:"message,omitempty" protobuf:"bytes,6,opt,name=message"`
        }
    */
    var observedReadyCondition v1.NodeCondition
    // currentReadyCondition is also a *v1.NodeCondition; it is non-nil only if
    // the node has a condition of type Ready.
    _, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
    if currentReadyCondition == nil {
        // The node has no condition of type Ready. A node carries conditions of
        // several types; there is also a MemoryPressure condition, for example.
        // If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
        // A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime is set
        // to node.CreationTimestamp to avoid handle the corner case.
        // Build a fresh Ready condition with status Unknown.
        observedReadyCondition = v1.NodeCondition{
            Type:   v1.NodeReady,
            Status: v1.ConditionUnknown,
            // Set for the first time, so the last heartbeat time is the node's creation time.
            LastHeartbeatTime: node.CreationTimestamp,
            // The last transition time is likewise the node's creation time.
            LastTransitionTime: node.CreationTimestamp,
        }
        // Use the node startup grace period; it is longer the first time, to
        // give the node time to start up.
        gracePeriod = nc.nodeStartupGracePeriod
        // Update nodeHealth.
        if nodeHealth != nil {
            nodeHealth.status = &node.Status
        } else {
            // Create the node health entry.
            nodeHealth = &nodeHealthData{
                status: &node.Status,
                // The probe timestamp starts as the node's creation time.
                probeTimestamp: node.CreationTimestamp,
                // The ready-transition timestamp starts as the node's creation time.
                readyTransitionTimestamp: node.CreationTimestamp,
            }
        }
        // In this branch we end up with currentReadyCondition == nil and nodeHealth != nil.
    } else {
        // Not the first time.
        // If ready condition is not nil, make a copy of it, since we may modify it in place later.
        // The observed Ready condition is the one just read from the node.
        observedReadyCondition = *currentReadyCondition
        // Use the node monitor grace period, unlike the first-startup case.
        gracePeriod = nc.nodeMonitorGracePeriod
    }
    // There are following cases to check:
    // - both saved and new status have no Ready Condition set - we leave everything as it is,
    // - saved status have no Ready Condition, but current one does - Controller was restarted with Node data already present in etcd,
    // - saved status have some Ready Condition, but current one does not - it's an error, but we fill it up because that's probably a good thing to do,
    // - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
    //   unresponsive, so we leave it as it is,
    // - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
    //   everything's in order, no transition occurred, we update only probeTimestamp,
    // - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition State -
    //   Ready Condition changed it state since we last seen it, so we update both probeTimestamp and readyTransitionTimestamp.
    // TODO: things to consider:
    // - if 'LastProbeTime' have gone back in time its probably an error, currently we ignore it,
    // - currently only correct Ready State transition outside of Node Controller is marking it ready by Kubelet, we don't check
    //   if that's the case, but it does not seem necessary.
    // The Ready condition saved from the previous check.
    var savedCondition *v1.NodeCondition
    // The Lease saved from the previous check:
    /*
        type Lease struct {
            metav1.TypeMeta `json:",inline"`
            metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
            Spec LeaseSpec `json:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
        }
    */
    var savedLease *coordv1.Lease
    if nodeHealth != nil {
        // The Ready condition observed last time.
        _, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
        // The Lease observed last time.
        savedLease = nodeHealth.lease
    }
    if nodeHealth == nil {
        // No nodeHealth entry exists yet, so create one.
        klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name)
        nodeHealth = &nodeHealthData{
            // Take the node's current status.
            status: &node.Status,
            // Both timestamps are set to now.
            probeTimestamp:           nc.now(),
            readyTransitionTimestamp: nc.now(),
        }
    } else if savedCondition == nil && currentReadyCondition != nil {
        // There was no saved Ready condition before, but the node has one now.
        klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
        nodeHealth = &nodeHealthData{
            status:                   &node.Status,
            probeTimestamp:           nc.now(),
            readyTransitionTimestamp: nc.now(),
        }
    } else if savedCondition != nil && currentReadyCondition == nil {
        // There was a saved Ready condition before, but it is gone now.
        klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
        // TODO: figure out what to do in this case. For now we do the same thing as above.
        nodeHealth = &nodeHealthData{
            status:                   &node.Status,
            probeTimestamp:           nc.now(),
            readyTransitionTimestamp: nc.now(),
        }
    } else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
        // Both the saved and the current Ready condition exist, and the
        // heartbeat time has changed since the last check.
        var transitionTime metav1.Time
        // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
        // otherwise we leave it as it is.
        if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
            // The saved transition time differs from the current one: a state transition occurred.
            klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition)
            // Set the transition time to now.
            transitionTime = nc.now()
        } else {
            // The transition times are equal: the condition did not change state
            // within this probe period, so keep the saved transition timestamp.
            transitionTime = nodeHealth.readyTransitionTimestamp
        }
        if klogV := klog.V(5); klogV.Enabled() {
            klogV.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, nodeHealth.status, node.Status)
        } else {
            klog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
        }
        nodeHealth = &nodeHealthData{
            status: &node.Status,
            // The probe timestamp is set to now.
            probeTimestamp:           nc.now(),
            readyTransitionTimestamp: transitionTime,
        }
    }
    // Always update the probe time if node lease is renewed.
    // Note: If kubelet never posted the node status, but continues renewing the
    // heartbeat leases, the node controller will assume the node is healthy and
    // take no action.
    // In other words: as long as the Lease keeps being renewed the probe time is
    // refreshed, so even if node.status is never updated, the node controller
    // still considers the node healthy.
    // Fetch the node's current Lease object.
    observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
    if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
        // Store the freshly observed Lease in the health entry.
        nodeHealth.lease = observedLease
        // Refresh the probe timestamp to now.
        nodeHealth.probeTimestamp = nc.now()
    }
    // If now > last probe timestamp + grace period, i.e. the node has gone longer
    // than the grace period without an update, set its conditions to Unknown.
    if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
        // NodeReady condition or lease was last set longer ago than gracePeriod, so
        // update it to Unknown (regardless of its current value) in the master.
        nodeConditionTypes := []v1.NodeConditionType{
            v1.NodeReady,
            v1.NodeMemoryPressure,
            v1.NodeDiskPressure,
            v1.NodePIDPressure,
            // We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
            // v1.NodeNetworkUnavailable,
        }
        nowTimestamp := nc.now()
        // Update every one of these condition types to Unknown.
        for _, nodeConditionType := range nodeConditionTypes {
            _, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType)
            if currentCondition == nil {
                klog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name)
                node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
                    Type:               nodeConditionType,
                    Status:             v1.ConditionUnknown,
                    Reason:             "NodeStatusNeverUpdated",
                    Message:            "Kubelet never posted node status.",
                    LastHeartbeatTime:  node.CreationTimestamp,
                    LastTransitionTime: nowTimestamp,
                })
            } else {
                klog.V(2).Infof("node %v hasn't been updated for %+v. Last %v is: %+v",
                    node.Name, nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition)
                if currentCondition.Status != v1.ConditionUnknown {
                    currentCondition.Status = v1.ConditionUnknown
                    currentCondition.Reason = "NodeStatusUnknown"
                    currentCondition.Message = "Kubelet stopped posting node status."
                    currentCondition.LastTransitionTime = nowTimestamp
                }
            }
        }
        // We need to update currentReadyCondition due to its value potentially changed.
        // Re-read the Ready condition; it may have been modified above.
        _, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
        // Write the updated status back to the API server.
        if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
            if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
                klog.Errorf("Error updating node %s: %v", node.Name, err)
                return gracePeriod, observedReadyCondition, currentReadyCondition, err
            }
            nodeHealth = &nodeHealthData{
                status:                   &node.Status,
                probeTimestamp:           nodeHealth.probeTimestamp,
                readyTransitionTimestamp: nc.now(),
                lease:                    observedLease,
            }
            return gracePeriod, observedReadyCondition, currentReadyCondition, nil
        }
    }
    return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}
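The nodeHealthMap used at the top of the function is a lock-protected map from node name to nodeHealthData. A sketch consistent with the getDeepCopy/set calls above (the real controller also deep-copies the nested status and lease; a shallow copy is shown here for brevity):

type nodeHealthMap struct {
    lock        sync.RWMutex
    nodeHealths map[string]*nodeHealthData
}

// getDeepCopy returns a copy so the caller can mutate it without holding the lock.
func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
    n.lock.RLock()
    defer n.lock.RUnlock()
    return n.nodeHealths[name].deepCopy()
}

// set stores the (possibly updated) health data back under the node's name.
func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
    n.lock.Lock()
    defer n.lock.Unlock()
    n.nodeHealths[name] = data
}

// deepCopy of nil is nil, which is why tryUpdateNodeHealth must handle a nil nodeHealth.
func (n *nodeHealthData) deepCopy() *nodeHealthData {
    if n == nil {
        return nil
    }
    out := *n // shallow field copy; illustration only
    return &out
}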
Configurable parameters
nodeStartupGracePeriod
nodeStartupGracePeriod is the amount of time which we allow starting a node to be unresponsive before marking it unhealthy.
How long a starting node is allowed to be unresponsive before it is marked unhealthy; the default is one minute.
Configurable via a kube-controller-manager startup flag, e.g.:
--node-startup-grace-period=2m
nodeMonitorGracePeriod
nodeMonitorGracePeriod is the amount of time which we allow a running node to be unresponsive before marking it unhealthy. Must be N times more than kubelet's nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet to post node status.
How long a running node is allowed to be unresponsive before it is marked unhealthy. It must be N times the kubelet's node status update frequency, where N is the number of retries the kubelet is allowed when posting node status, otherwise the two fall out of step; the default is 40 seconds.
Configurable via a kube-controller-manager startup flag:
--node-monitor-grace-period=40s
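With the kubelet's default nodeStatusUpdateFrequency of 10 seconds, the 40-second default grace period corresponds to N = 4 allowed retries. A hypothetical helper (not part of the controller) makes the constraint explicit:

// validateGracePeriods is a hypothetical check of the documented constraint:
// nodeMonitorGracePeriod must be at least N times the kubelet's update frequency.
func validateGracePeriods(gracePeriod, statusUpdateFrequency time.Duration, retries int) error {
    if gracePeriod < time.Duration(retries)*statusUpdateFrequency {
        return fmt.Errorf("node-monitor-grace-period %v is less than %d * nodeStatusUpdateFrequency (%v)",
            gracePeriod, retries, statusUpdateFrequency)
    }
    return nil
}

// With the defaults: validateGracePeriods(40*time.Second, 10*time.Second, 4) passes.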
nodeMonitorPeriod
How often the node controller checks node status; the default is every 5 seconds.
Configurable via a kube-controller-manager startup flag:
--node-monitor-period=5s
Summary
A node's liveness is represented by two objects: node.status and the Lease corresponding to each node. The node controller (node_lifecycle_controller, also called the node lifecycle controller) periodically (every 5 seconds by default) checks node.status and the Lease. If the current time is later than the last Lease renewal or node.status update plus the grace period, i.e. the node has exceeded nodeMonitorGracePeriod without a heartbeat, the node is marked unhealthy.
Both node.status and the Lease are updated by the kubelet:
The kubelet updates .status when the node's status changes, or when there has been no change for a configured interval. The default interval for .status updates is 5 minutes, much longer than the 40-second default timeout for unreachable nodes.
The kubelet creates its Lease object and renews it every 10 seconds (the default renewal interval). Lease renewals happen independently of .status updates. If a Lease update fails, the kubelet retries with exponential backoff, starting at 200 milliseconds and capped at 7 seconds.
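The shape of that retry policy can be expressed with apimachinery's wait.Backoff; this is a sketch of the described behaviour, not the kubelet's actual code, and renewLease is a hypothetical stand-in for its lease update call:

// Start at 200ms, double on each failure, and never wait longer than 7s.
backoff := wait.Backoff{
    Duration: 200 * time.Millisecond,
    Factor:   2.0,
    Steps:    10,
    Cap:      7 * time.Second,
}
err := wait.ExponentialBackoff(backoff, func() (bool, error) {
    // renewLease is a hypothetical stand-in for the kubelet's lease update call.
    return renewLease() == nil, nil
})
// If err is non-nil the kubelet gave up for this cycle; the controller's
// grace-period check will eventually mark the node's conditions Unknown.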
Problem
This mechanism can leave a node "alive in name only". For example, when a node runs short of CPU it can no longer run its workloads properly (CPU is a compressible resource: a CPU shortage merely slows applications down, whereas incompressible resources such as memory and PIDs fail hard, since running out of memory crashes programs and running out of PIDs prevents new processes from being created). Yet under the mechanism above, a single heartbeat reported within the grace period renews the node for another full grace period. Furthermore, after a node is marked unhealthy it still takes 5 minutes (by default) before its first pod is evicted. So as long as the node reports at least one heartbeat every 5 minutes + grace period, it stays alive and its pods are never evicted, even though the node itself can no longer do useful work.
Solutions
Set requests and limits on application components so they cannot consume excessive resources, i.e. configure the pods' Quality of Service (see the sketch after the link):
https://kubernetes.io/zh-cn/docs/tasks/configure-pod-container/quality-service-pod/
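For example, giving every container identical requests and limits places the pod in the Guaranteed QoS class. With the client-go types that looks roughly like this (the name, image, and resource values are illustrative):

container := v1.Container{
    Name:  "app",                    // illustrative
    Image: "example.com/app:latest", // illustrative
    Resources: v1.ResourceRequirements{
        // Requests == Limits for every resource => Guaranteed QoS.
        Requests: v1.ResourceList{
            v1.ResourceCPU:    resource.MustParse("500m"),
            v1.ResourceMemory: resource.MustParse("256Mi"),
        },
        Limits: v1.ResourceList{
            v1.ResourceCPU:    resource.MustParse("500m"),
            v1.ResourceMemory: resource.MustParse("256Mi"),
        },
    },
}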
Reserve resources on each node for system daemons, so that the kubelet itself cannot be starved and stop working:
https://kubernetes.io/zh-cn/docs/tasks/administer-cluster/reserve-compute-resources/