在kubernetes
集群中部署metrics-server后可以通过kubectl top <pod/node>
查看pod
或node
的资源使用情况,它的原理是什么我们来分析下,在分析metrics-server
之前我们先来看下apiserver-aggregation。
本文基于:metrics-server-0.5.2
分析。
apiserver aggregation
kube-apiserver
的聚合机制允许通过开发自定义apiserver来扩展,Aggregation充当代理层,资源的请求会根据注册的路径代理到对应的apiserver上,这个注册过程在kubernetes中通过一个专门的资源来完成,该资源名叫apiservices
,使用kubectl get apiservices
可以查看当前集群中的全部apiservices
1
2
3
4
5
6
7
8
9
10
11
|
[dev@k8s]$ kubectl get apiservices
NAME SERVICE AVAILABLE AGE
v1. Local True 2y31d
v1.admissionregistration.k8s.io Local True 58d
v1.apiextensions.k8s.io Local True 58d
v1.apps Local True 2y31d
v1.authentication.k8s.io Local True 2y31d
v1.authorization.k8s.io Local True 2y31d
v1.autoscaling Local True 2y31d
v1.batch Local True 2y31d
...
|
当部署metrics-server
完以后会在集群中创建一个名为v1beta1.metrics.k8s.io
的apiservice
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
[dev@172.16.8.7_server0 k8s]$ kubectl get apiservices v1beta1.metrics.k8s.io -o yaml
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
creationTimestamp: "2019-12-13T09:36:38Z"
name: v1beta1.metrics.k8s.io
resourceVersion: "4621101195"
uid: 0fa09ed2-1d8c-11ea-941e-fa4055edae04
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: metrics-server
namespace: kube-system
port: 443
version: v1beta1
versionPriority: 100
status:
conditions:
- lastTransitionTime: "2021-12-27T06:33:42Z"
message: all checks passed
reason: Passed
status: "True"
type: Available
|
其中几个比较重要的字段:
- group:api 组的名字
- service: 指定apiserver运行所在的服务,
kubernetes
内置对象都会标记为Local
metrics-server
metrics-server
一方面会调用kubelet
暴露的metrics接口获取当前的资源使用情况,另一方面会启动一个apiserver供kubectl top等客户端调用。
Metrics获取
metrics-server通过kubelet
/stats/summary?only_cpu_and_memory=true接口获取node或pod的资源使用状态,具体逻辑在这个函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
// GetMetrics requests /stats/summary?only_cpu_and_memory=true from the given
// node and converts the decoded Summary into a storage.MetricsBatch via
// decodeBatch.
//
// The port is kc.defaultPort unless kc.useNodeStatusPort is set and the node
// reports a non-zero kubelet endpoint port. The host is whatever
// kc.addrResolver.NodeAddress returns for the node. When kc.client is nil,
// http.DefaultClient is used.
//
// On any failure the returned batch is nil and the error is returned as-is.
func (kc *kubeletClient) GetMetrics(ctx context.Context, node *corev1.Node) (*storage.MetricsBatch, error) {
	port := kc.defaultPort
	if nodePort := int(node.Status.DaemonEndpoints.KubeletEndpoint.Port); kc.useNodeStatusPort && nodePort != 0 {
		port = nodePort
	}

	addr, err := kc.addrResolver.NodeAddress(node)
	if err != nil {
		return nil, err
	}

	// Named "endpoint" so the local does not shadow the net/url package.
	endpoint := url.URL{
		Scheme:   kc.scheme,
		Host:     net.JoinHostPort(addr, strconv.Itoa(port)),
		Path:     "/stats/summary",
		RawQuery: "only_cpu_and_memory=true",
	}

	// Attach ctx at construction time instead of cloning the request later.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint.String(), nil)
	if err != nil {
		return nil, err
	}

	client := kc.client
	if client == nil {
		client = http.DefaultClient
	}

	summary := &Summary{}
	if err := kc.makeRequestAndGetValue(client, req, summary); err != nil {
		// Do not decode a summary that may be empty or only partially filled;
		// a non-nil error must come with a nil batch.
		return nil, err
	}
	return decodeBatch(summary), nil
}
|
在kubelet节点上可以通过curl 127.0.0.1:10255/stats/summary?only_cpu_and_memory=true
查看资源使用情况。
以后可能会切换到通过/metrics/resource
接口获取资源使用状态信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
// GetMetrics implements client.KubeletMetricsGetter.
//
// It assembles the node's /metrics/resource URL (scheme from kc.scheme, host
// from kc.addrResolver, port from kc.defaultPort or, when
// kc.useNodeStatusPort is set and the value is non-zero, from the node's
// reported kubelet endpoint port) and hands the fetch to kc.getMetrics.
func (kc *kubeletClient) GetMetrics(ctx context.Context, node *corev1.Node) (*storage.MetricsBatch, error) {
	port := kc.defaultPort
	if reported := int(node.Status.DaemonEndpoints.KubeletEndpoint.Port); kc.useNodeStatusPort && reported != 0 {
		port = reported
	}

	host, err := kc.addrResolver.NodeAddress(node)
	if err != nil {
		return nil, err
	}

	target := &url.URL{
		Scheme: kc.scheme,
		Host:   net.JoinHostPort(host, strconv.Itoa(port)),
		Path:   "/metrics/resource",
	}
	return kc.getMetrics(ctx, target.String(), node.Name)
}
|
apiserver
apiserver会返回两个资源对象
在集群中可以通过以下命令查看:
kubectl get nodes.v1beta1.metrics.k8s.io
kubectl get pods.v1beta1.metrics.k8s.io
具体逻辑
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
// Get looks up the named node through m.nodeLister and returns the first
// metrics entry produced by m.getNodeMetrics for it.
//
// A not-found error from the lister is passed through unchanged; any other
// lister or metrics failure is logged and returned wrapped. A nil node or an
// empty metrics result yields a NotFound error for m.groupResource.
func (m *nodeMetrics) Get(ctx context.Context, name string, opts *metav1.GetOptions) (runtime.Object, error) {
	node, err := m.nodeLister.Get(name)
	switch {
	case err == nil:
		// Lookup succeeded; continue below.
	case errors.IsNotFound(err):
		// return not-found errors directly
		return nil, err
	default:
		klog.ErrorS(err, "Failed getting node", "node", klog.KRef("", name))
		return nil, fmt.Errorf("failed getting node: %w", err)
	}

	if node == nil {
		return nil, errors.NewNotFound(m.groupResource, name)
	}

	// "items" rather than "nodeMetrics" so the receiver type name is not shadowed.
	items, err := m.getNodeMetrics(node)
	if err != nil {
		klog.ErrorS(err, "Failed reading node metrics", "node", klog.KRef("", name))
		return nil, fmt.Errorf("failed reading node metrics: %w", err)
	}

	if len(items) > 0 {
		return &items[0], nil
	}
	return nil, errors.NewNotFound(m.groupResource, name)
}
|
参考
https://kubernetes.io/docs/reference/kubernetes-api/cluster-resources/api-service-v1/