Following up on the previous post, Quickly Setting Up Windows Kubernetes, we found that Kubernetes on Windows comes with some experiences that differ from Linux — pitfalls, so to speak — such as hostAliases. If we really want to put Windows into production, then beyond the basics of Pods, Volumes, Services, and logs, we also need monitoring. Typically we would use Prometheus for monitoring and Grafana for dashboards, but Prometheus's Node Exporter is designed for *nix, so on Windows we have to find our own way. The Prometheus Node Exporter documentation points Windows users to the WMI exporter, and interested readers are welcome to try it; this article instead takes a more first-principles approach, to understand how to write a collection program for Prometheus.

To follow along you'll need:

  • A Windows Kubernetes cluster
  • A Prometheus environment
  • First we need to find the data format the kubelet exposes on Windows. Since cadvisor does not support Windows, a community member wrote a relatively simple implementation to fill the gap; it keeps things the same as on Linux, exposing metrics at <Node_IP>:10255/stats/summary. The data behind metrics-server and kubectl top also comes from here. It looks roughly like this:
    {
      "node": {
        "nodeName": "35598k8s9001",
        "startTime": "2018-08-26T07:25:08Z",
        "cpu": {
          "time": "2018-09-10T01:44:52Z",
          "usageCoreNanoSeconds": 8532520000000
        },
        "memory": {
          "time": "2018-09-10T01:44:52Z",
          "availableBytes": 14297423872,
          "usageBytes": 1978798080,
          "workingSetBytes": 734490624,
          "rssBytes": 0,
          "pageFaults": 0,
          "majorPageFaults": 0
        },
        "fs": {
          "time": "2018-09-10T01:44:52Z",
          "availableBytes": 15829303296,
          "capacityBytes": 32212250624,
          "usedBytes": 16382947328
        },
        "runtime": {
          "imageFs": {
            "time": "2018-09-10T01:44:53Z",
            "availableBytes": 15829303296,
            "capacityBytes": 32212250624,
            "usedBytes": 16382947328,
            "inodesUsed": 0
          }
        }
      },
      "pods": [
        {
          "podRef": {
            "name": "stdlogserverwin-5fbcc5648d-ztqsq",
            "namespace": "default",
            "uid": "f461a0b4-ab36-11e8-93c4-0017fa0362de"
          },
          "startTime": "2018-08-29T02:55:15Z",
          "containers": [
            {
              "name": "stdlogserverwin",
              "startTime": "2018-08-29T02:56:24Z",
              "cpu": {
                "time": "2018-09-10T01:44:54Z",
                "usageCoreNanoSeconds": 749578125000
              },
              "memory": {
                "time": "2018-09-10T01:44:54Z",
                "workingSetBytes": 83255296
              },
              "rootfs": {
                "time": "2018-09-10T01:44:54Z",
                "availableBytes": 15829303296,
                "capacityBytes": 32212250624,
                "usedBytes": 0
              },
              "logs": {
                "time": "2018-09-10T01:44:53Z",
                "availableBytes": 15829303296,
                "capacityBytes": 32212250624,
                "usedBytes": 16382947328,
                "inodesUsed": 0
              },
              "userDefinedMetrics": null
            }
          ],
          "cpu": {
            "time": "2018-08-29T02:56:24Z",
            "usageNanoCores": 0,
            "usageCoreNanoSeconds": 749578125000
          },
          "memory": {
            "time": "2018-09-10T01:44:54Z",
            "availableBytes": 0,
            "usageBytes": 0,
            "workingSetBytes": 83255296,
            "rssBytes": 0,
            "pageFaults": 0,
            "majorPageFaults": 0
          },
          "volume": [
            {
              "time": "2018-08-29T02:55:16Z",
              "availableBytes": 17378648064,
              "capacityBytes": 32212250624,
              "usedBytes": 14833602560,
              "inodesFree": 0,
              "inodes": 0,
              "inodesUsed": 0,
              "name": "default-token-wv5fc"
            }
          ],
          "ephemeral-storage": {
            "time": "2018-09-10T01:44:54Z",
            "availableBytes": 15829303296,
            "capacityBytes": 32212250624,
            "usedBytes": 16382947328
          }
        }
      ]
    }
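  • Before writing the exporter, it's worth fetching this endpoint yourself to confirm the shape of the data. A minimal sketch, assuming the kubelet's read-only port 10255 is enabled and you run it on the node itself (otherwise replace localhost with the node's IP):

    # quick sanity check of the kubelet stats endpoint
    from urllib.request import urlopen
    import json

    resp = urlopen('http://localhost:10255/stats/summary')
    summary = json.loads(resp.read().decode('utf-8'))
    # print the node name and its cumulative CPU time in nanoseconds
    print(summary['node']['nodeName'])
    print(summary['node']['cpu']['usageCoreNanoSeconds'])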
  • As you can see above, it contains metrics for both the node itself and the pods. It offers a bit less than cadvisor does, but it is enough for basic monitoring. Next we need to write a small program that converts this data into a format Prometheus can scrape. Let's write a small example in Python, starting with declarations of the stats objects we want to expose:
    class Node:
        def __init__(self, name, cpu, memory):
            self.name = name
            self.cpu = cpu
            self.memory = memory

    class Pod:
        def __init__(self, name, namespace, cpu, memory):
            self.name = name
            self.namespace = namespace
            self.cpu = cpu
            self.memory = memory

    class Stats:
        def __init__(self, node, pods):
            self.node = node
            self.pods = pods
  • Then use the Prometheus python client (prometheus_client) to write a polling program that converts the kubelet stats data:
    from urllib.request import urlopen
    # the classes defined above, saved as stats.py
    from stats import Node
    from stats import Pod
    from stats import Stats
    import json
    import asyncio
    import prometheus_client as prom
    import logging

    def getMetrics(url):
        # fetch the stats from the kubelet
        response = urlopen(url)
        string = response.read().decode('utf-8')
        json_obj = json.loads(string)
        # map the JSON onto the stats objects defined earlier
        node = Node('', '', '')
        node.name = json_obj['node']['nodeName']
        node.cpu = json_obj['node']['cpu']['usageCoreNanoSeconds']
        node.memory = json_obj['node']['memory']['usageBytes']
        pods_array = json_obj['pods']
        pods_list = []
        for item in pods_array:
            pod = Pod('', '', '', '')
            pod.name = item['podRef']['name']
            pod.namespace = item['podRef']['namespace']
            pod.cpu = item['cpu']['usageCoreNanoSeconds']
            # pod-level usageBytes is 0 in the sample above, so use workingSetBytes
            pod.memory = item['memory']['workingSetBytes']
            pods_list.append(pod)
        stats = Stats('', '')
        stats.node = node
        stats.pods = pods_list
        return stats

    # a simple log format
    log_format = "%(asctime)s - %(levelname)s [%(name)s] %(threadName)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    # declare the metrics to export, with labels for later queries
    g1 = prom.Gauge('node_cpu_usageCoreNanoSeconds', 'CPU usage of the node', labelnames=['node_name'])
    g2 = prom.Gauge('node_mem_usageBytes', 'Memory usage of the node', labelnames=['node_name'])
    g3 = prom.Gauge('pod_cpu_usageCoreNanoSeconds', 'CPU usage of the pod', labelnames=['pod_name', 'pod_namespace'])
    g4 = prom.Gauge('pod_mem_usageBytes', 'Memory usage of the pod', labelnames=['pod_name', 'pod_namespace'])

    async def expose_stats(url):
        while True:
            stats = getMetrics(url)
            # log the node's own stats as an example
            logging.info("nodename: {} value {}".format(stats.node.name, stats.node.cpu))
            # update the gauges for this polling cycle
            g1.labels(node_name=stats.node.name).set(stats.node.cpu)
            g2.labels(node_name=stats.node.name).set(stats.node.memory)
            for item in stats.pods:
                g3.labels(pod_name=item.name, pod_namespace=item.namespace).set(item.cpu)
                g4.labels(pod_name=item.name, pod_namespace=item.namespace).set(item.memory)
            await asyncio.sleep(1)

    if __name__ == '__main__':
        loop = asyncio.get_event_loop()
        # start an HTTP server that Prometheus can scrape
        prom.start_http_server(8000)
        # run one copy of this program on every Windows node,
        # or deploy the script remotely to do the exposing
        url = 'http://localhost:10255/stats/summary'
        tasks = [loop.create_task(expose_stats(url))]
        try:
            loop.run_forever()
        except KeyboardInterrupt:
            pass
        finally:
            loop.close()
  • Once this is written you can start the program; visiting its port 8000 shows the exposed data, along the lines of the sample below.
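  • For reference, a scrape of port 8000 returns the standard Prometheus text format; with the sample summary above it would contain lines roughly like these (prometheus_client also adds its own process and platform metrics):

    # HELP node_cpu_usageCoreNanoSeconds CPU usage of the node
    # TYPE node_cpu_usageCoreNanoSeconds gauge
    node_cpu_usageCoreNanoSeconds{node_name="35598k8s9001"} 8532520000000.0
    # HELP pod_mem_usageBytes Memory usage of the pod
    # TYPE pod_mem_usageBytes gauge
    pod_mem_usageBytes{pod_name="stdlogserverwin-5fbcc5648d-ztqsq",pod_namespace="default"} 83255296.0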

  • Next, add a scrape job to the Prometheus configuration to register this new collection target, for example:
    - job_name: python_app
      scrape_interval: 15s
      scrape_timeout: 10s
      metrics_path: /
      scheme: http
      static_configs:
        - targets:
            - localhost:8000
  • With that in place, the metrics can be queried from the Prometheus UI, for example with the query sketched below.
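  • Since usageCoreNanoSeconds is a cumulative counter, the number you usually want in a query is its per-second rate; dividing by 1e9 converts nanoseconds of CPU time per second into cores. As a sketch, the same query can also be run programmatically against the Prometheus HTTP API, assuming Prometheus listens on localhost:9090:

    # query Prometheus's HTTP API for each node's CPU usage in cores
    from urllib.request import urlopen
    from urllib.parse import urlencode
    import json

    query = 'rate(node_cpu_usageCoreNanoSeconds[5m]) / 1e9'
    params = urlencode({'query': query})
    resp = urlopen('http://localhost:9090/api/v1/query?' + params)
    result = json.loads(resp.read().decode('utf-8'))
    # each series carries its labels plus a [timestamp, value] pair
    for series in result['data']['result']:
        print(series['metric'].get('node_name'), series['value'][1])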

Copyright notice: This is an original article by bigdaddyblog, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/bigdaddyblog/p/9719878.html