Merge "Add cgroup support to ram sensor"

This commit is contained in:
Zuul 2019-01-09 19:32:24 +00:00 committed by Gerrit Code Review
commit 97da909bd8
7 changed files with 216 additions and 2 deletions

View File

@ -210,6 +210,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The used RAM (excluding buffers and cache) on this executor, as
a percentage multiplied by 100.
.. stat:: pct_used_ram_cgroup
:type: gauge
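The used RAM (excluding buffers and cache) on this executor, relative to
the memory limit imposed by its cgroup, as a percentage multiplied by 100.
Only reported when a cgroup limit lower than the total system memory applies.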
.. stat:: zuul.nodepool.requests
Holds metrics related to Zuul requests and responses from Nodepool.

View File

@ -0,0 +1,6 @@
---
features:
- |
The :attr:`executor.min_avail_mem` setting now takes cgroup limits
into account. There is also a new metric
`zuul.executor.<executor>.pct_used_ram_cgroup` available.

34
tests/fixtures/cgroup/memory.stat.bad vendored Normal file
View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 5153960755
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 9223372036854771712
hierarchical_memsw_limit 9223372036854771712
total_cache 0
total_rss 561152
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

34
tests/fixtures/cgroup/memory.stat.ok vendored Normal file
View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 1073741824
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

View File

@ -31,6 +31,7 @@ from tests.base import (
)
from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
from zuul.executor.sensors.ram import RAMSensor
class TestExecutorRepos(ZuulTestCase):
@ -466,12 +467,59 @@ class TestGovernor(ZuulTestCase):
pass
ram = Dummy()
ram.percent = 20.0 # 20% used
ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
vm_mock.return_value = ram
loadavg_mock.return_value = (0.0, 0.0, 0.0)
self.executor_server.manageLoad()
self.assertTrue(self.executor_server.accepting_work)
loadavg_mock.return_value = (100.0, 100.0, 100.0)
self.executor_server.manageLoad()
self.assertFalse(self.executor_server.accepting_work)
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_governor(self, vm_mock, loadavg_mock):
    """Verify the executor stops accepting work when RAM runs low.

    The load average is pinned to zero for the whole test so that
    memory usage is the only governor input that changes between the
    two ``manageLoad()`` calls.
    """
    class Dummy(object):
        pass
    ram = Dummy()
    ram.percent = 20.0  # 20% used
    ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
    vm_mock.return_value = ram
    loadavg_mock.return_value = (0.0, 0.0, 0.0)
    self.executor_server.manageLoad()
    self.assertTrue(self.executor_server.accepting_work)
    # Only memory usage is raised here. Raising the load average as
    # well would let the load check refuse work on its own and hide a
    # broken RAM sensor.
    ram.percent = 99.0  # 99% used
    self.executor_server.manageLoad()
    self.assertFalse(self.executor_server.accepting_work)
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
    """Run the governor against each cgroup ``memory.stat`` fixture."""
    class Dummy(object):
        pass

    # Host memory is reported as 20% used and the load is idle for the
    # whole test; only the cgroup statistics file changes between runs.
    host_mem = Dummy()
    host_mem.percent = 20.0  # 20% used
    host_mem.total = 8 * 1024 * 1024 * 1024  # 8GiB
    vm_mock.return_value = host_mem
    loadavg_mock.return_value = (0.0, 0.0, 0.0)

    sensor = [candidate for candidate in self.executor_server.sensors
              if isinstance(candidate, RAMSensor)][0]

    scenarios = (
        # no cgroup limit
        ('memory.stat.nolimit', self.assertTrue),
        # cgroup limit 5GiB and ram usage 20%
        ('memory.stat.ok', self.assertTrue),
        # cgroup limit 5GiB and ram usage 96%
        ('memory.stat.bad', self.assertFalse),
    )
    for fixture_name, check in scenarios:
        sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', fixture_name)
        self.executor_server.manageLoad()
        check(self.executor_server.accepting_work)

View File

@ -13,11 +13,14 @@
# under the License.
import logging
import math
import psutil
from zuul.executor.sensors import SensorInterface
from zuul.lib.config import get_default
# Default path of the memory statistics file parsed by RAMSensor. Each
# sensor stores its own copy in ``cgroup_stats_file`` so that it can be
# pointed at a different file.
CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
def get_avail_mem_pct():
avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
def __init__(self, config=None):
    """Initialise the sensor's threshold and statistics file path.

    :param config: configuration object passed through to
        ``get_default``; the ``min_avail_mem`` option of the
        ``executor`` section is requested with ``'5.0'`` as default.
    """
    configured_minimum = get_default(
        config, 'executor', 'min_avail_mem', '5.0')
    self.min_avail_mem = float(configured_minimum)
    # Stored per instance so the path can be redirected to another file.
    self.cgroup_stats_file = CGROUP_STATS_FILE
def _read_cgroup_stat(self):
    """Parse ``self.cgroup_stats_file`` into a dict of integer values.

    A well-formed line has the form ``<key> <integer>``. Lines that do
    not match (blank lines, extra fields, non-numeric values) are
    skipped individually, so one bad line no longer discards every
    entry that follows it.

    :returns: dict mapping each key to its ``int`` value; empty when
        the file cannot be opened or decoded.
    """
    stat = {}
    try:
        with open(self.cgroup_stats_file) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 2:
                    continue
                key, value = fields
                try:
                    stat[key] = int(value)
                except ValueError:
                    # Non-numeric value: drop this entry only.
                    continue
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable.
        # ValueError: undecodable bytes (UnicodeDecodeError).
        # Both are treated as "no statistics available".
        pass
    return stat
def _get_cgroup_limit(self):
    """Return the effective cgroup memory limit, or ``math.inf``.

    The ``hierarchical_memory_limit`` value only counts as a limit
    when it is strictly lower than the total memory reported by
    ``psutil.virtual_memory()``; a missing or larger-or-equal value
    yields ``math.inf``.
    """
    cgroup_limit = self._read_cgroup_stat().get(
        'hierarchical_memory_limit', math.inf)
    host_total = psutil.virtual_memory().total
    return cgroup_limit if cgroup_limit < host_total else math.inf
def _get_avail_mem_pct_cgroup(self):
    """Return the percentage of the cgroup memory limit still unused.

    Computed as ``100 - total_rss / hierarchical_memory_limit * 100``
    from the values returned by ``_read_cgroup_stat()``.

    :returns: float percentage. ``100.0`` when either value is missing
        from the statistics, ``0.0`` when the reported limit is not
        positive.
    """
    stat = self._read_cgroup_stat()
    limit = stat.get('hierarchical_memory_limit', math.inf)
    usage = stat.get('total_rss', math.inf)

    if math.isinf(limit) or math.isinf(usage):
        # pretend we have all memory available if we got infs
        return 100.0
    if limit <= 0:
        # A non-positive limit leaves no headroom; report that instead
        # of raising ZeroDivisionError.
        return 0.0
    return 100.0 - usage / limit * 100
def isOk(self):
avail_mem_pct = get_avail_mem_pct()
@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
return False, "low memory {:3.1f}% < {}".format(
avail_mem_pct, self.min_avail_mem)
return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
if math.isinf(self._get_cgroup_limit()):
# we have no cgroup defined limit so we're done now
return True, "{:3.1f}% <= {}".format(
avail_mem_pct, self.min_avail_mem)
avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
if avail_mem_pct_cgroup < self.min_avail_mem:
return False, "low memory cgroup {:3.1f}% < {}".format(
avail_mem_pct_cgroup, self.min_avail_mem)
return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
avail_mem_pct, self.min_avail_mem,
avail_mem_pct_cgroup, self.min_avail_mem)
def reportStats(self, statsd, base_key):
    """Send memory usage gauges to statsd.

    ``<base_key>.pct_used_ram`` is always emitted.
    ``<base_key>.pct_used_ram_cgroup`` is emitted only when
    ``_get_cgroup_limit()`` returns a finite value. Both values are
    percentages multiplied by 100 and truncated to ``int``.

    :param statsd: client object providing ``gauge(key, value)``.
    :param base_key: prefix prepended to each gauge name.
    """
    host_avail_pct = get_avail_mem_pct()
    statsd.gauge(base_key + '.pct_used_ram',
                 int((100.0 - host_avail_pct) * 100))

    if not math.isfinite(self._get_cgroup_limit()):
        # No effective cgroup limit, so there is nothing more to report.
        return

    cgroup_avail_pct = self._get_avail_mem_pct_cgroup()
    statsd.gauge(base_key + '.pct_used_ram_cgroup',
                 int((100.0 - cgroup_avail_pct) * 100))