summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2019-01-09 19:32:24 +0000
committerGerrit Code Review <review@openstack.org>2019-01-09 19:32:24 +0000
commit97da909bd8c59f1b0574219ba4446ef0e79f7ba9 (patch)
treea0ca7289bc57d843e8278cecdf1bda49c15ad8ce
parent9801e8a07cd12548e0ac34f860932541fd2eecd5 (diff)
parent145e62b568128b4e58f85bb2d04420f4f7c44a5d (diff)
Merge "Add cgroup support to ram sensor"
-rw-r--r--doc/source/admin/monitoring.rst6
-rw-r--r--releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml6
-rw-r--r--tests/fixtures/cgroup/memory.stat.bad34
-rw-r--r--tests/fixtures/cgroup/memory.stat.nolimit34
-rw-r--r--tests/fixtures/cgroup/memory.stat.ok34
-rw-r--r--tests/unit/test_executor.py50
-rw-r--r--zuul/executor/sensors/ram.py54
7 files changed, 216 insertions, 2 deletions
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index ccde1c7..a51d175 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -210,6 +210,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
210 The used RAM (excluding buffers and cache) on this executor, as 210 The used RAM (excluding buffers and cache) on this executor, as
211 a percentage multiplied by 100. 211 a percentage multiplied by 100.
212 212
213 .. stat:: pct_used_ram_cgroup
214 :type: gauge
215
216 The used RAM (excluding buffers and cache) on this executor relative to
217 the memory limit set by its cgroup, as a percentage multiplied by 100.
218
213.. stat:: zuul.nodepool.requests 219.. stat:: zuul.nodepool.requests
214 220
215 Holds metrics related to Zuul requests and responses from Nodepool. 221 Holds metrics related to Zuul requests and responses from Nodepool.
diff --git a/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml b/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml
new file mode 100644
index 0000000..fed40ef
--- /dev/null
+++ b/releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 The :attr:`executor.min_avail_mem` setting now takes cgroup limits
5 into account. There is also a new metric,
6 ``zuul.executor.<executor>.pct_used_ram_cgroup``, available.
diff --git a/tests/fixtures/cgroup/memory.stat.bad b/tests/fixtures/cgroup/memory.stat.bad
new file mode 100644
index 0000000..2c92310
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.bad
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 5368709120
18hierarchical_memsw_limit 5368709120
19total_cache 0
20total_rss 5153960755
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/fixtures/cgroup/memory.stat.nolimit b/tests/fixtures/cgroup/memory.stat.nolimit
new file mode 100644
index 0000000..ed4f378
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.nolimit
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 9223372036854771712
18hierarchical_memsw_limit 9223372036854771712
19total_cache 0
20total_rss 561152
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/fixtures/cgroup/memory.stat.ok b/tests/fixtures/cgroup/memory.stat.ok
new file mode 100644
index 0000000..ab62da5
--- /dev/null
+++ b/tests/fixtures/cgroup/memory.stat.ok
@@ -0,0 +1,34 @@
1cache 0
2rss 561152
3rss_huge 0
4mapped_file 0
5dirty 0
6writeback 0
7swap 0
8pgpgin 654
9pgpgout 517
10pgfault 1089
11pgmajfault 0
12inactive_anon 0
13active_anon 454656
14inactive_file 0
15active_file 0
16unevictable 0
17hierarchical_memory_limit 5368709120
18hierarchical_memsw_limit 5368709120
19total_cache 0
20total_rss 1073741824
21total_rss_huge 0
22total_mapped_file 0
23total_dirty 0
24total_writeback 0
25total_swap 0
26total_pgpgin 654
27total_pgpgout 517
28total_pgfault 1089
29total_pgmajfault 0
30total_inactive_anon 0
31total_active_anon 454656
32total_inactive_file 0
33total_active_file 0
34total_unevictable 0
diff --git a/tests/unit/test_executor.py b/tests/unit/test_executor.py
index 2814733..a3dfccc 100644
--- a/tests/unit/test_executor.py
+++ b/tests/unit/test_executor.py
@@ -31,6 +31,7 @@ from tests.base import (
31) 31)
32 32
33from zuul.executor.sensors.startingbuilds import StartingBuildsSensor 33from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
34from zuul.executor.sensors.ram import RAMSensor
34 35
35 36
36class TestExecutorRepos(ZuulTestCase): 37class TestExecutorRepos(ZuulTestCase):
@@ -466,15 +467,62 @@ class TestGovernor(ZuulTestCase):
466 pass 467 pass
467 ram = Dummy() 468 ram = Dummy()
468 ram.percent = 20.0 # 20% used 469 ram.percent = 20.0 # 20% used
470 ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
469 vm_mock.return_value = ram 471 vm_mock.return_value = ram
470 loadavg_mock.return_value = (0.0, 0.0, 0.0) 472 loadavg_mock.return_value = (0.0, 0.0, 0.0)
471 self.executor_server.manageLoad() 473 self.executor_server.manageLoad()
472 self.assertTrue(self.executor_server.accepting_work) 474 self.assertTrue(self.executor_server.accepting_work)
473 ram.percent = 99.0 # 99% used
474 loadavg_mock.return_value = (100.0, 100.0, 100.0) 475 loadavg_mock.return_value = (100.0, 100.0, 100.0)
475 self.executor_server.manageLoad() 476 self.executor_server.manageLoad()
476 self.assertFalse(self.executor_server.accepting_work) 477 self.assertFalse(self.executor_server.accepting_work)
477 478
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_governor(self, vm_mock, loadavg_mock):
    """Check that high system RAM usage makes the executor refuse work."""
    class Dummy(object):
        """Bare attribute holder standing in for psutil's result."""

    # Start with an idle machine: zero load average and 20% of 8GiB used.
    loadavg_mock.return_value = (0.0, 0.0, 0.0)
    memory = Dummy()
    memory.total = 8 * 1024 * 1024 * 1024  # 8GiB
    memory.percent = 20.0  # 20% used
    vm_mock.return_value = memory

    self.executor_server.manageLoad()
    self.assertTrue(self.executor_server.accepting_work)

    # Only the RAM usage changes; the load average stays at zero.
    memory.percent = 99.0  # 99% used
    self.executor_server.manageLoad()
    self.assertFalse(self.executor_server.accepting_work)
494
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
    """Check that the RAM sensor honours the cgroup memory statistics.

    System-wide memory and load are mocked as healthy throughout, so any
    change in ``accepting_work`` comes from the cgroup fixture in use.
    """
    class Dummy(object):
        """Bare attribute holder standing in for psutil's result."""

    loadavg_mock.return_value = (0.0, 0.0, 0.0)
    memory = Dummy()
    memory.total = 8 * 1024 * 1024 * 1024  # 8GiB
    memory.percent = 20.0  # 20% used
    vm_mock.return_value = memory

    ram_sensor = [sensor for sensor in self.executor_server.sensors
                  if isinstance(sensor, RAMSensor)][0]

    # (fixture file, expected assertion) pairs, run in this order:
    scenarios = (
        # Set no cgroup limit
        ('memory.stat.nolimit', self.assertTrue),
        # Set cgroup limit 5GiB and ram usage 20%
        ('memory.stat.ok', self.assertTrue),
        # Set cgroup limit 5GiB and ram usage 96%
        ('memory.stat.bad', self.assertFalse),
    )
    for fixture_name, check in scenarios:
        ram_sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', fixture_name)
        self.executor_server.manageLoad()
        check(self.executor_server.accepting_work)
525
478 @mock.patch('os.statvfs') 526 @mock.patch('os.statvfs')
479 def test_hdd_governor(self, statvfs_mock): 527 def test_hdd_governor(self, statvfs_mock):
480 class Dummy(object): 528 class Dummy(object):
diff --git a/zuul/executor/sensors/ram.py b/zuul/executor/sensors/ram.py
index 9ffbd7b..33ed1a7 100644
--- a/zuul/executor/sensors/ram.py
+++ b/zuul/executor/sensors/ram.py
@@ -13,11 +13,14 @@
13# under the License. 13# under the License.
14 14
15import logging 15import logging
16import math
16import psutil 17import psutil
17 18
18from zuul.executor.sensors import SensorInterface 19from zuul.executor.sensors import SensorInterface
19from zuul.lib.config import get_default 20from zuul.lib.config import get_default
20 21
22CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
23
21 24
22def get_avail_mem_pct(): 25def get_avail_mem_pct():
23 avail_mem_pct = 100.0 - psutil.virtual_memory().percent 26 avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
def __init__(self, config=None):
    """Set up the sensor from the ``executor`` section of *config*.

    :param config: configuration object handed to ``get_default``; the
        ``min_avail_mem`` option falls back to ``'5.0'``.
    """
    min_avail_mem = get_default(config, 'executor', 'min_avail_mem', '5.0')
    self.min_avail_mem = float(min_avail_mem)
    # Stored per instance so the stats file location can be overridden.
    self.cgroup_stats_file = CGROUP_STATS_FILE
37
def _read_cgroup_stat(self):
    """Parse ``self.cgroup_stats_file`` into a dict of integer counters.

    Each usable line has the form ``<key> <integer>``. Lines that do not
    match (blank lines, a wrong number of fields, a non-integer value) are
    skipped individually, so one bad line no longer hides the entries
    that follow it.

    :returns: dict mapping each key to its integer value; empty when the
        file cannot be opened or decoded.
    """
    stat = {}
    try:
        with open(self.cgroup_stats_file) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 2:
                    continue
                key, value = fields
                try:
                    stat[key] = int(value)
                except ValueError:
                    # Not a counter we can interpret; ignore this line.
                    continue
    except (OSError, ValueError):
        # Best effort: a missing or unreadable stats file (OSError) or
        # undecodable content (UnicodeDecodeError is a ValueError) just
        # means we report whatever was parsed so far.
        pass
    return stat
48
def _get_cgroup_limit(self):
    """Return the effective cgroup memory limit, or ``math.inf`` if none.

    The ``hierarchical_memory_limit`` value from the cgroup stats only
    counts as a limit when it is lower than the total memory reported by
    psutil; a missing key or a larger value yields ``math.inf``.
    """
    cgroup_limit = self._read_cgroup_stat().get(
        'hierarchical_memory_limit', math.inf)
    total_memory = psutil.virtual_memory().total
    return cgroup_limit if cgroup_limit < total_memory else math.inf
57
def _get_avail_mem_pct_cgroup(self):
    """Return the percentage of the cgroup memory limit still available.

    Computed from ``total_rss`` and ``hierarchical_memory_limit`` in the
    cgroup stats. When either value is missing (treated as infinite),
    100 is returned.
    """
    counters = self._read_cgroup_stat()
    mem_limit = counters.get('hierarchical_memory_limit', math.inf)
    mem_used = counters.get('total_rss', math.inf)

    # pretend we have all memory available if we got infs
    if any(math.isinf(value) for value in (mem_limit, mem_used)):
        return 100

    used_pct = mem_used / mem_limit * 100
    return 100.0 - used_pct
33 68
34 def isOk(self): 69 def isOk(self):
35 avail_mem_pct = get_avail_mem_pct() 70 avail_mem_pct = get_avail_mem_pct()
@@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
38 return False, "low memory {:3.1f}% < {}".format( 73 return False, "low memory {:3.1f}% < {}".format(
39 avail_mem_pct, self.min_avail_mem) 74 avail_mem_pct, self.min_avail_mem)
40 75
41 return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem) 76 if math.isinf(self._get_cgroup_limit()):
77 # we have no cgroup defined limit so we're done now
78 return True, "{:3.1f}% <= {}".format(
79 avail_mem_pct, self.min_avail_mem)
80
81 avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
82 if avail_mem_pct_cgroup < self.min_avail_mem:
83 return False, "low memory cgroup {:3.1f}% < {}".format(
84 avail_mem_pct_cgroup, self.min_avail_mem)
85
86 return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
87 avail_mem_pct, self.min_avail_mem,
88 avail_mem_pct_cgroup, self.min_avail_mem)
42 89
def reportStats(self, statsd, base_key):
    """Emit RAM usage gauges below *base_key*.

    ``pct_used_ram`` is always sent. ``pct_used_ram_cgroup`` is sent only
    when ``_get_cgroup_limit`` reports a finite limit. Both values are the
    used percentage multiplied by 100 and truncated to an int.

    :param statsd: client object providing ``gauge(name, value)``.
    :param base_key: key prefix the gauge names are appended to.
    """
    used_pct = 100.0 - get_avail_mem_pct()
    statsd.gauge(base_key + '.pct_used_ram', int(used_pct * 100))

    if not math.isfinite(self._get_cgroup_limit()):
        # No effective cgroup limit, so there is nothing more to report.
        return

    used_pct_cgroup = 100.0 - self._get_avail_mem_pct_cgroup()
    statsd.gauge(base_key + '.pct_used_ram_cgroup',
                 int(used_pct_cgroup * 100))