summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTobias Henkel <tobias.henkel@bmw.de>2018-03-03 21:30:09 +0000
committerTobias Henkel <tobias.henkel@bmw.de>2018-12-18 22:25:27 +0100
commit145e62b568128b4e58f85bb2d04420f4f7c44a5d (patch)
tree5f977adadc0f3fc76f9e77f2fe484530cf8b5f14
parent5f5032cf820359fb423e417eb12f7c542228b2b0 (diff)
Add cgroup support to ram sensor
When running within k8s, the system memory statistics are useless as soon as there are configured limits (which is strongly advised). In this case we additionally need to check the cgroups. Change-Id: Idebe5d7e60dc862e89d012594ab362a19f18708d
Notes
Notes (review): Code-Review+2: Joshua Hesketh <josh@nitrotech.org> Code-Review+2: Monty Taylor <mordred@inaugust.com> Workflow+1: Monty Taylor <mordred@inaugust.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Wed, 09 Jan 2019 19:32:24 +0000 Reviewed-on: https://review.openstack.org/549506 Project: openstack-infra/zuul Branch: refs/heads/master
-rw-r--r--doc/source/admin/monitoring.rst6
-rw-r--r--releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml6
-rw-r--r--tests/fixtures/cgroup/memory.stat.bad34
-rw-r--r--tests/fixtures/cgroup/memory.stat.nolimit34
-rw-r--r--tests/fixtures/cgroup/memory.stat.ok34
-rw-r--r--tests/unit/test_executor.py50
-rw-r--r--zuul/executor/sensors/ram.py54
7 files changed, 216 insertions, 2 deletions
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index bd47f7b..78da9d3 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -188,6 +188,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
188 The used RAM (excluding buffers and cache) on this executor, as 188 The used RAM (excluding buffers and cache) on this executor, as
189 a percentage multiplied by 100. 189 a percentage multiplied by 100.
190 190
191 .. stat:: pct_used_ram_cgroup
192 :type: gauge
193
194 The used RAM (excluding buffers and cache) on this executor allowed by
195 the cgroup, as a percentage multiplied by 100.
196
191.. stat:: zuul.nodepool.requests 197.. stat:: zuul.nodepool.requests
192 198
193 Holds metrics related to Zuul requests and responses from Nodepool. 199 Holds metrics related to Zuul requests and responses from Nodepool.
diff --git a/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml b/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml
new file mode 100644
index 0000000..fed40ef
--- /dev/null
+++ b/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 The :attr:`executor.min_avail_mem` setting now takes cgroup limits
5 into account. There is also a new metric
6 `zuul.executor.<executor>.pct_used_ram_cgroup` available.
diff --git a/tests/fixtures/cgroup/memory.stat.bad b/tests/fixtures/cgroup/memory.stat.bad
new file mode 100644
index 0000000..2c92310
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.bad
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 5368709120
18hierarchical_memsw_limit 5368709120
19total_cache 0
20total_rss 5153960755
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/fixtures/cgroup/memory.stat.nolimit b/tests/fixtures/cgroup/memory.stat.nolimit
new file mode 100644
index 0000000..ed4f378
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.nolimit
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 9223372036854771712
18hierarchical_memsw_limit 9223372036854771712
19total_cache 0
20total_rss 561152
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/fixtures/cgroup/memory.stat.ok b/tests/fixtures/cgroup/memory.stat.ok
new file mode 100644
index 0000000..ab62da5
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.ok
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 5368709120
18hierarchical_memsw_limit 5368709120
19total_cache 0
20total_rss 1073741824
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/unit/test_executor.py b/tests/unit/test_executor.py
index 2814733..a3dfccc 100644
--- a/tests/unit/test_executor.py
+++ b/tests/unit/test_executor.py
@@ -31,6 +31,7 @@ from tests.base import (
31) 31)
32 32
33from zuul.executor.sensors.startingbuilds import StartingBuildsSensor 33from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
34from zuul.executor.sensors.ram import RAMSensor
34 35
35 36
36class TestExecutorRepos(ZuulTestCase): 37class TestExecutorRepos(ZuulTestCase):
@@ -466,15 +467,62 @@ class TestGovernor(ZuulTestCase):
466 pass 467 pass
467 ram = Dummy() 468 ram = Dummy()
468 ram.percent = 20.0 # 20% used 469 ram.percent = 20.0 # 20% used
470 ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
469 vm_mock.return_value = ram 471 vm_mock.return_value = ram
470 loadavg_mock.return_value = (0.0, 0.0, 0.0) 472 loadavg_mock.return_value = (0.0, 0.0, 0.0)
471 self.executor_server.manageLoad() 473 self.executor_server.manageLoad()
472 self.assertTrue(self.executor_server.accepting_work) 474 self.assertTrue(self.executor_server.accepting_work)
473 ram.percent = 99.0 # 99% used
474 loadavg_mock.return_value = (100.0, 100.0, 100.0) 475 loadavg_mock.return_value = (100.0, 100.0, 100.0)
475 self.executor_server.manageLoad() 476 self.executor_server.manageLoad()
476 self.assertFalse(self.executor_server.accepting_work) 477 self.assertFalse(self.executor_server.accepting_work)
477 478
479 @mock.patch('os.getloadavg')
480 @mock.patch('psutil.virtual_memory')
481 def test_ram_governor(self, vm_mock, loadavg_mock):
482 class Dummy(object):
483 pass
484 ram = Dummy()
485 ram.percent = 20.0 # 20% used
486 ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
487 vm_mock.return_value = ram
488 loadavg_mock.return_value = (0.0, 0.0, 0.0)
489 self.executor_server.manageLoad()
490 self.assertTrue(self.executor_server.accepting_work)
491 ram.percent = 99.0 # 99% used
492 self.executor_server.manageLoad()
493 self.assertFalse(self.executor_server.accepting_work)
494
495 @mock.patch('os.getloadavg')
496 @mock.patch('psutil.virtual_memory')
497 def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
498 class Dummy(object):
499 pass
500 ram = Dummy()
501 ram.percent = 20.0 # 20% used
502 ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
503 vm_mock.return_value = ram
504 loadavg_mock.return_value = (0.0, 0.0, 0.0)
505
506 # Set no cgroup limit
507 ram_sensor = [x for x in self.executor_server.sensors
508 if isinstance(x, RAMSensor)][0]
509 ram_sensor.cgroup_stats_file = os.path.join(
510 FIXTURE_DIR, 'cgroup', 'memory.stat.nolimit')
511 self.executor_server.manageLoad()
512 self.assertTrue(self.executor_server.accepting_work)
513
514 # Set cgroup limit 5GiB and ram usage 20%
515 ram_sensor.cgroup_stats_file = os.path.join(
516 FIXTURE_DIR, 'cgroup', 'memory.stat.ok')
517 self.executor_server.manageLoad()
518 self.assertTrue(self.executor_server.accepting_work)
519
520 # Set cgroup limit 5GiB and ram usage 96%
521 ram_sensor.cgroup_stats_file = os.path.join(
522 FIXTURE_DIR, 'cgroup', 'memory.stat.bad')
523 self.executor_server.manageLoad()
524 self.assertFalse(self.executor_server.accepting_work)
525
478 @mock.patch('os.statvfs') 526 @mock.patch('os.statvfs')
479 def test_hdd_governor(self, statvfs_mock): 527 def test_hdd_governor(self, statvfs_mock):
480 class Dummy(object): 528 class Dummy(object):
diff --git a/zuul/executor/sensors/ram.py b/zuul/executor/sensors/ram.py
index 9ffbd7b..33ed1a7 100644
--- a/zuul/executor/sensors/ram.py
+++ b/zuul/executor/sensors/ram.py
@@ -13,11 +13,14 @@
13# under the License. 13# under the License.
14 14
15import logging 15import logging
16import math
16import psutil 17import psutil
17 18
18from zuul.executor.sensors import SensorInterface 19from zuul.executor.sensors import SensorInterface
19from zuul.lib.config import get_default 20from zuul.lib.config import get_default
20 21
22CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
23
21 24
22def get_avail_mem_pct(): 25def get_avail_mem_pct():
23 avail_mem_pct = 100.0 - psutil.virtual_memory().percent 26 avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
30 def __init__(self, config=None): 33 def __init__(self, config=None):
31 self.min_avail_mem = float(get_default(config, 'executor', 34 self.min_avail_mem = float(get_default(config, 'executor',
32 'min_avail_mem', '5.0')) 35 'min_avail_mem', '5.0'))
36 self.cgroup_stats_file = CGROUP_STATS_FILE
37
38 def _read_cgroup_stat(self):
39 stat = {}
40 try:
41 with open(self.cgroup_stats_file) as f:
42 for line in f.readlines():
43 key, value = line.split(' ')
44 stat[key] = int(value.strip())
45 except Exception:
46 pass
47 return stat
48
49 def _get_cgroup_limit(self):
50 stat = self._read_cgroup_stat()
51 limit = stat.get('hierarchical_memory_limit', math.inf)
52 mem_total = psutil.virtual_memory().total
53 if limit < mem_total:
54 return limit
55 else:
56 return math.inf
57
58 def _get_avail_mem_pct_cgroup(self):
59 stat = self._read_cgroup_stat()
60 limit = stat.get('hierarchical_memory_limit', math.inf)
61 usage = stat.get('total_rss', math.inf)
62
63 if math.isinf(limit) or math.isinf(usage):
64 # pretend we have all memory available if we got infs
65 return 100
66
67 return 100.0 - usage / limit * 100
33 68
34 def isOk(self): 69 def isOk(self):
35 avail_mem_pct = get_avail_mem_pct() 70 avail_mem_pct = get_avail_mem_pct()
@@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
38 return False, "low memory {:3.1f}% < {}".format( 73 return False, "low memory {:3.1f}% < {}".format(
39 avail_mem_pct, self.min_avail_mem) 74 avail_mem_pct, self.min_avail_mem)
40 75
41 return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem) 76 if math.isinf(self._get_cgroup_limit()):
77 # we have no cgroup defined limit so we're done now
78 return True, "{:3.1f}% <= {}".format(
79 avail_mem_pct, self.min_avail_mem)
80
81 avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
82 if avail_mem_pct_cgroup < self.min_avail_mem:
83 return False, "low memory cgroup {:3.1f}% < {}".format(
84 avail_mem_pct_cgroup, self.min_avail_mem)
85
86 return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
87 avail_mem_pct, self.min_avail_mem,
88 avail_mem_pct_cgroup, self.min_avail_mem)
42 89
43 def reportStats(self, statsd, base_key): 90 def reportStats(self, statsd, base_key):
44 avail_mem_pct = get_avail_mem_pct() 91 avail_mem_pct = get_avail_mem_pct()
45 92
46 statsd.gauge(base_key + '.pct_used_ram', 93 statsd.gauge(base_key + '.pct_used_ram',
47 int((100.0 - avail_mem_pct) * 100)) 94 int((100.0 - avail_mem_pct) * 100))
95
96 if math.isfinite(self._get_cgroup_limit()):
97 avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
98 statsd.gauge(base_key + '.pct_used_ram_cgroup',
99 int((100.0 - avail_mem_pct_cgroup) * 100))