Merge "Add cgroup support to ram sensor"
This commit is contained in:
commit
97da909bd8
|
@ -210,6 +210,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
The used RAM (excluding buffers and cache) on this executor, as
|
||||
a percentage multiplied by 100.
|
||||
|
||||
.. stat:: pct_used_ram_cgroup
|
||||
:type: gauge
|
||||
|
||||
The used RAM (excluding buffers and cache) on this executor allowed by
|
||||
the cgroup, as a percentage multiplied by 100.
|
||||
|
||||
.. stat:: zuul.nodepool.requests
|
||||
|
||||
Holds metrics related to Zuul requests and responses from Nodepool.
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
The :attr:`executor.min_avail_mem` setting now takes cgroup limits
|
||||
into account. There is also a new metric
|
||||
`zuul.executor.<executor>.pct_used_ram_cgroup` available.
|
|
@ -0,0 +1,34 @@
|
|||
cache 0
|
||||
rss 561152
|
||||
rss_huge 0
|
||||
mapped_file 0
|
||||
dirty 0
|
||||
writeback 0
|
||||
swap 0
|
||||
pgpgin 654
|
||||
pgpgout 517
|
||||
pgfault 1089
|
||||
pgmajfault 0
|
||||
inactive_anon 0
|
||||
active_anon 454656
|
||||
inactive_file 0
|
||||
active_file 0
|
||||
unevictable 0
|
||||
hierarchical_memory_limit 5368709120
|
||||
hierarchical_memsw_limit 5368709120
|
||||
total_cache 0
|
||||
total_rss 5153960755
|
||||
total_rss_huge 0
|
||||
total_mapped_file 0
|
||||
total_dirty 0
|
||||
total_writeback 0
|
||||
total_swap 0
|
||||
total_pgpgin 654
|
||||
total_pgpgout 517
|
||||
total_pgfault 1089
|
||||
total_pgmajfault 0
|
||||
total_inactive_anon 0
|
||||
total_active_anon 454656
|
||||
total_inactive_file 0
|
||||
total_active_file 0
|
||||
total_unevictable 0
|
|
@ -0,0 +1,34 @@
|
|||
cache 0
|
||||
rss 561152
|
||||
rss_huge 0
|
||||
mapped_file 0
|
||||
dirty 0
|
||||
writeback 0
|
||||
swap 0
|
||||
pgpgin 654
|
||||
pgpgout 517
|
||||
pgfault 1089
|
||||
pgmajfault 0
|
||||
inactive_anon 0
|
||||
active_anon 454656
|
||||
inactive_file 0
|
||||
active_file 0
|
||||
unevictable 0
|
||||
hierarchical_memory_limit 9223372036854771712
|
||||
hierarchical_memsw_limit 9223372036854771712
|
||||
total_cache 0
|
||||
total_rss 561152
|
||||
total_rss_huge 0
|
||||
total_mapped_file 0
|
||||
total_dirty 0
|
||||
total_writeback 0
|
||||
total_swap 0
|
||||
total_pgpgin 654
|
||||
total_pgpgout 517
|
||||
total_pgfault 1089
|
||||
total_pgmajfault 0
|
||||
total_inactive_anon 0
|
||||
total_active_anon 454656
|
||||
total_inactive_file 0
|
||||
total_active_file 0
|
||||
total_unevictable 0
|
|
@ -0,0 +1,34 @@
|
|||
cache 0
|
||||
rss 561152
|
||||
rss_huge 0
|
||||
mapped_file 0
|
||||
dirty 0
|
||||
writeback 0
|
||||
swap 0
|
||||
pgpgin 654
|
||||
pgpgout 517
|
||||
pgfault 1089
|
||||
pgmajfault 0
|
||||
inactive_anon 0
|
||||
active_anon 454656
|
||||
inactive_file 0
|
||||
active_file 0
|
||||
unevictable 0
|
||||
hierarchical_memory_limit 5368709120
|
||||
hierarchical_memsw_limit 5368709120
|
||||
total_cache 0
|
||||
total_rss 1073741824
|
||||
total_rss_huge 0
|
||||
total_mapped_file 0
|
||||
total_dirty 0
|
||||
total_writeback 0
|
||||
total_swap 0
|
||||
total_pgpgin 654
|
||||
total_pgpgout 517
|
||||
total_pgfault 1089
|
||||
total_pgmajfault 0
|
||||
total_inactive_anon 0
|
||||
total_active_anon 454656
|
||||
total_inactive_file 0
|
||||
total_active_file 0
|
||||
total_unevictable 0
|
|
@ -31,6 +31,7 @@ from tests.base import (
|
|||
)
|
||||
|
||||
from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
|
||||
from zuul.executor.sensors.ram import RAMSensor
|
||||
|
||||
|
||||
class TestExecutorRepos(ZuulTestCase):
|
||||
|
@ -466,12 +467,59 @@ class TestGovernor(ZuulTestCase):
|
|||
pass
|
||||
ram = Dummy()
|
||||
ram.percent = 20.0 # 20% used
|
||||
ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
|
||||
vm_mock.return_value = ram
|
||||
loadavg_mock.return_value = (0.0, 0.0, 0.0)
|
||||
self.executor_server.manageLoad()
|
||||
self.assertTrue(self.executor_server.accepting_work)
|
||||
loadavg_mock.return_value = (100.0, 100.0, 100.0)
|
||||
self.executor_server.manageLoad()
|
||||
self.assertFalse(self.executor_server.accepting_work)
|
||||
|
||||
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_governor(self, vm_mock, loadavg_mock):
    """The executor stops accepting work when host RAM usage is high."""
    class Dummy(object):
        pass
    ram = Dummy()
    ram.percent = 20.0  # 20% used
    ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
    vm_mock.return_value = ram
    loadavg_mock.return_value = (0.0, 0.0, 0.0)
    self.executor_server.manageLoad()
    self.assertTrue(self.executor_server.accepting_work)
    # Raise only the RAM usage (leave loadavg at 0) so the governor's
    # rejection is attributable to the RAM sensor rather than the load
    # sensor, which a (100, 100, 100) loadavg would also trip.
    ram.percent = 99.0  # 99% used
    self.executor_server.manageLoad()
    self.assertFalse(self.executor_server.accepting_work)
|
||||
|
||||
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
    """cgroup memory limits are honored by the RAM governor."""
    class Dummy(object):
        pass
    ram = Dummy()
    ram.percent = 20.0  # 20% used
    ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
    vm_mock.return_value = ram
    loadavg_mock.return_value = (0.0, 0.0, 0.0)

    ram_sensor = next(s for s in self.executor_server.sensors
                      if isinstance(s, RAMSensor))

    def use_fixture(name):
        # Point the sensor at a canned memory.stat fixture.
        ram_sensor.cgroup_stats_file = os.path.join(
            FIXTURE_DIR, 'cgroup', name)

    # Set no cgroup limit
    use_fixture('memory.stat.nolimit')
    self.executor_server.manageLoad()
    self.assertTrue(self.executor_server.accepting_work)

    # Set cgroup limit 5GiB and ram usage 20%
    use_fixture('memory.stat.ok')
    self.executor_server.manageLoad()
    self.assertTrue(self.executor_server.accepting_work)

    # Set cgroup limit 5GiB and ram usage 96%
    use_fixture('memory.stat.bad')
    self.executor_server.manageLoad()
    self.assertFalse(self.executor_server.accepting_work)
|
||||
|
||||
|
|
|
@ -13,11 +13,14 @@
|
|||
# under the License.
|
||||
|
||||
import logging
|
||||
import math
|
||||
import psutil
|
||||
|
||||
from zuul.executor.sensors import SensorInterface
|
||||
from zuul.lib.config import get_default
|
||||
|
||||
CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
|
||||
|
||||
|
||||
def get_avail_mem_pct():
|
||||
avail_mem_pct = 100.0 - psutil.virtual_memory().percent
|
||||
|
@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
|
|||
def __init__(self, config=None):
    """Initialize the RAM sensor.

    min_avail_mem is the minimum percentage of free memory required
    for the executor to accept work; defaults to 5.0 when unset.
    """
    min_mem = get_default(config, 'executor', 'min_avail_mem', '5.0')
    self.min_avail_mem = float(min_mem)
    # Overridable in tests to point at a fixture file.
    self.cgroup_stats_file = CGROUP_STATS_FILE
|
||||
|
||||
def _read_cgroup_stat(self):
    """Parse the cgroup memory.stat file into a dict.

    Returns a mapping of stat name -> integer value.  An unreadable
    file (e.g. a host without cgroup memory accounting) yields an
    empty dict.  A single malformed line is skipped rather than
    aborting the parse: the original single split(' ') raised on any
    unexpected line, and the blanket except then silently returned a
    truncated dict.
    """
    stat = {}
    try:
        with open(self.cgroup_stats_file) as f:
            for line in f:
                try:
                    # memory.stat lines are "<key> <value>"; bound the
                    # split so extra whitespace cannot raise.
                    key, value = line.split(' ', 1)
                    stat[key] = int(value.strip())
                except ValueError:
                    continue
    except Exception:
        # No cgroup (or unreadable file): treat as "no data".
        pass
    return stat
|
||||
|
||||
def _get_cgroup_limit(self):
    """Return the effective cgroup memory limit in bytes.

    Returns math.inf when the cgroup imposes no limit below the
    host's total physical RAM (an unconfined cgroup reports a huge
    sentinel value for hierarchical_memory_limit).
    """
    cgroup_limit = self._read_cgroup_stat().get(
        'hierarchical_memory_limit', math.inf)
    total_ram = psutil.virtual_memory().total
    return cgroup_limit if cgroup_limit < total_ram else math.inf
|
||||
|
||||
def _get_avail_mem_pct_cgroup(self):
    """Return the percentage of cgroup-limited memory still available.

    Reads the memory limit and RSS usage from the cgroup memory.stat
    file.  If either value is missing, or the limit is not a positive
    number (guarding against division by zero on a pathological stat
    file), no meaningful percentage can be computed, so report 100%
    available and let the plain RAM check govern instead.
    """
    stat = self._read_cgroup_stat()
    limit = stat.get('hierarchical_memory_limit', math.inf)
    usage = stat.get('total_rss', math.inf)

    if math.isinf(limit) or math.isinf(usage) or limit <= 0:
        # Pretend we have all memory available when data is unusable.
        # Return a float for type consistency with the computed path.
        return 100.0

    return 100.0 - usage / limit * 100
|
||||
|
||||
def isOk(self):
|
||||
avail_mem_pct = get_avail_mem_pct()
|
||||
|
@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
|
|||
return False, "low memory {:3.1f}% < {}".format(
|
||||
avail_mem_pct, self.min_avail_mem)
|
||||
|
||||
return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
|
||||
if math.isinf(self._get_cgroup_limit()):
|
||||
# we have no cgroup defined limit so we're done now
|
||||
return True, "{:3.1f}% <= {}".format(
|
||||
avail_mem_pct, self.min_avail_mem)
|
||||
|
||||
avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
|
||||
if avail_mem_pct_cgroup < self.min_avail_mem:
|
||||
return False, "low memory cgroup {:3.1f}% < {}".format(
|
||||
avail_mem_pct_cgroup, self.min_avail_mem)
|
||||
|
||||
return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
|
||||
avail_mem_pct, self.min_avail_mem,
|
||||
avail_mem_pct_cgroup, self.min_avail_mem)
|
||||
|
||||
def reportStats(self, statsd, base_key):
    """Emit RAM usage gauges under *base_key*.

    Always reports overall used RAM (percent x 100, as an int);
    additionally reports cgroup usage when a cgroup memory limit
    below physical RAM is in effect.
    """
    used_pct = 100.0 - get_avail_mem_pct()
    statsd.gauge(base_key + '.pct_used_ram', int(used_pct * 100))

    if not math.isfinite(self._get_cgroup_limit()):
        # No effective cgroup limit: nothing more to report.
        return

    used_pct_cgroup = 100.0 - self._get_avail_mem_pct_cgroup()
    statsd.gauge(base_key + '.pct_used_ram_cgroup',
                 int(used_pct_cgroup * 100))
||||
|
|
Loading…
Reference in New Issue