Merge "Rework zuul nodepool stats reporting"
This commit is contained in:
commit
a6ba568d72
|
@ -188,78 +188,69 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
The used RAM (excluding buffers and cache) on this executor, as
|
||||
a percentage multiplied by 100.
|
||||
|
||||
.. stat:: zuul.nodepool
|
||||
.. stat:: zuul.nodepool.requests
|
||||
|
||||
Holds metrics related to Zuul requests from Nodepool.
|
||||
Holds metrics related to Zuul requests and responses from Nodepool.
|
||||
|
||||
.. stat:: requested
|
||||
States are one of:
|
||||
|
||||
*requested*
|
||||
Node request submitted by Zuul to Nodepool
|
||||
*canceled*
|
||||
Node request was canceled by Zuul
|
||||
*failed*
|
||||
Nodepool failed to fulfill a node request
|
||||
*fulfilled*
|
||||
Nodes were assigned by Nodepool
|
||||
|
||||
.. stat:: <state>
|
||||
:type: timer
|
||||
|
||||
Records the elapsed time from request to completion for states
|
||||
`failed` and `fulfilled`. For example,
|
||||
``zuul.nodepool.requests.fulfilled.mean`` will give the average
|
||||
time for all fulfilled requests within each ``statsd`` flush
|
||||
interval.
|
||||
|
||||
A lower value for `fulfilled` requests is better. Ideally,
|
||||
there will be no `failed` requests.
|
||||
|
||||
.. stat:: <state>.total
|
||||
:type: counter
|
||||
|
||||
Incremented each time a node request is submitted to Nodepool.
|
||||
Incremented when nodes are assigned or removed as described in
|
||||
the states above.
|
||||
|
||||
.. stat:: label.<label>
|
||||
:type: counter
|
||||
|
||||
Incremented each time a request for a specific label is
|
||||
submitted to Nodepool.
|
||||
|
||||
.. stat:: size.<size>
|
||||
:type: counter
|
||||
|
||||
Incremented each time a request of a specific size is submitted
|
||||
to Nodepool. For example, a request for 3 nodes would use the
|
||||
key ``zuul.nodepool.requested.size.3``.
|
||||
|
||||
.. stat:: canceled
|
||||
.. stat:: <state>.size.<size>
|
||||
:type: counter, timer
|
||||
|
||||
The counter is incremented each time a node request is canceled
|
||||
by Zuul. The timer records the elapsed time from request to
|
||||
cancelation.
|
||||
Increments for the node count of each request. For example, a
|
||||
request for 3 nodes would use the key
|
||||
``zuul.nodepool.requests.requested.size.3``; fulfillment of 3
|
||||
node requests can be tracked with
|
||||
``zuul.nodepool.requests.fulfilled.size.3``.
|
||||
|
||||
.. stat:: label.<label>
|
||||
:type: counter, timer
|
||||
The timer is implemented for ``fulfilled`` and ``failed``
|
||||
requests. For example, the timer
|
||||
``zuul.nodepool.requests.failed.size.3.mean`` gives the average
|
||||
time of 3-node failed requests within the ``statsd`` flush
|
||||
interval. A lower value for `fulfilled` requests is better.
|
||||
Ideally, there will be no `failed` requests.
|
||||
|
||||
The same, for a specific label.
|
||||
|
||||
.. stat:: size.<size>
|
||||
:type: counter, timer
|
||||
|
||||
The same, for a specific request size.
|
||||
|
||||
.. stat:: fulfilled
|
||||
.. stat:: <state>.label.<label>
|
||||
:type: counter, timer
|
||||
|
||||
The counter is incremented each time a node request is fulfilled
|
||||
by Nodepool. The timer records the elapsed time from request to
|
||||
fulfillment.
|
||||
Increments for the label of each request. For example, requests
|
||||
for `centos7` nodes could be tracked with
|
||||
``zuul.nodepool.requests.requested.label.centos7``.
|
||||
|
||||
.. stat:: label.<label>
|
||||
:type: counter, timer
|
||||
|
||||
The same, for a specific label.
|
||||
|
||||
.. stat:: size.<size>
|
||||
:type: counter, timer
|
||||
|
||||
The same, for a specific request size.
|
||||
|
||||
.. stat:: failed
|
||||
:type: counter, timer
|
||||
|
||||
The counter is incremented each time Nodepool fails to fulfill a
|
||||
node request. The timer records the elapsed time from request
|
||||
to failure.
|
||||
|
||||
.. stat:: label.<label>
|
||||
:type: counter, timer
|
||||
|
||||
The same, for a specific label.
|
||||
|
||||
.. stat:: size.<size>
|
||||
:type: counter, timer
|
||||
|
||||
The same, for a specific request size.
|
||||
The timer is implemented for ``fulfilled`` and ``failed``
|
||||
requests. For example, the timer
|
||||
``zuul.nodepool.requests.fulfilled.label.centos7.mean`` gives
|
||||
the average time of ``centos7`` fulfilled requests within the
|
||||
``statsd`` flush interval. A lower value for `fulfilled`
|
||||
requests is better. Ideally, there will be no `failed`
|
||||
requests.
|
||||
|
||||
.. stat:: current_requests
|
||||
:type: gauge
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
upgrade:
|
||||
- |
|
||||
The `zuul.nodepool` statistics have been moved under
|
||||
`zuul.nodepool.requests` to allow sub-stats to work correctly.
|
||||
For example `zuul.nodepool.requested` has become
|
||||
`zuul.nodepool.requests.requested.total`. The previously missing
|
||||
`label` and `size` counters are now available at
|
||||
`zuul.nodepool.requests.<state>.(label.<label>|size.<size>)`. For more info see
|
||||
the monitoring documentation.
|
|
@ -1294,6 +1294,8 @@ class BuildHistory(object):
|
|||
|
||||
|
||||
class FakeStatsd(threading.Thread):
|
||||
log = logging.getLogger("zuul.test.FakeStatsd")
|
||||
|
||||
def __init__(self):
|
||||
threading.Thread.__init__(self)
|
||||
self.daemon = True
|
||||
|
@ -1314,6 +1316,7 @@ class FakeStatsd(threading.Thread):
|
|||
data = self.sock.recvfrom(1024)
|
||||
if not data:
|
||||
return
|
||||
self.log.debug("Appending: %s" % data[0])
|
||||
self.stats.append(data[0])
|
||||
if fd == self.wake_read:
|
||||
return
|
||||
|
|
|
@ -130,9 +130,6 @@ class TestScheduler(ZuulTestCase):
|
|||
self.assertEqual(self.getJobFromHistory('project-test2').node,
|
||||
'label1')
|
||||
|
||||
for stat in self.statsd.stats:
|
||||
k, v = stat.decode('utf-8').split(':')
|
||||
self.log.debug('stat %s:%s', k, v)
|
||||
# TODOv3(jeblair): we may want to report stats by tenant (also?).
|
||||
# Per-driver
|
||||
self.assertReportedStat('zuul.event.gerrit.comment-added', value='1',
|
||||
|
@ -164,23 +161,26 @@ class TestScheduler(ZuulTestCase):
|
|||
exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
|
||||
'.', '_')
|
||||
self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.requested', value='1', kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.requested.label.label1',
|
||||
value='1', kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.fulfilled.label.label1',
|
||||
value='1', kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.requested.size.1', value='1',
|
||||
kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.fulfilled.size.1', value='1',
|
||||
kind='c')
|
||||
self.assertReportedStat('zuul.nodepool.current_requests', value='1',
|
||||
kind='g')
|
||||
self.assertReportedStat('zuul.executors.online', value='1',
|
||||
kind='g')
|
||||
self.assertReportedStat('zuul.executors.accepting', value='1',
|
||||
kind='g')
|
||||
self.assertReportedStat('zuul.mergers.online', value='1',
|
||||
kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.requested.total', value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.requested.label.label1',
|
||||
value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.fulfilled.label.label1',
|
||||
value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.requested.size.1', value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.requests.fulfilled.size.1', value='1', kind='c')
|
||||
self.assertReportedStat(
|
||||
'zuul.nodepool.current_requests', value='1', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.executors.online', value='1', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.executors.accepting', value='1', kind='g')
|
||||
self.assertReportedStat(
|
||||
'zuul.mergers.online', value='1', kind='g')
|
||||
|
||||
for build in self.history:
|
||||
self.assertTrue(build.parameters['zuul']['voting'])
|
||||
|
|
|
@ -24,33 +24,40 @@ class Nodepool(object):
|
|||
self.sched = scheduler
|
||||
|
||||
def emitStats(self, request):
|
||||
# Implements the following :
|
||||
# counter zuul.nodepool.requests.<state>.total
|
||||
# counter zuul.nodepool.requests.<state>.label.<label>
|
||||
# counter zuul.nodepool.requests.<state>.size.<size>
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed)
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed).label.<label>
|
||||
# timer zuul.nodepool.requests.(fulfilled|failed).size.<size>
|
||||
# gauge zuul.nodepool.current_requests
|
||||
if not self.sched.statsd:
|
||||
return
|
||||
statsd = self.sched.statsd
|
||||
# counter zuul.nodepool.requested
|
||||
# counter zuul.nodepool.requested.label.<label>
|
||||
# counter zuul.nodepool.requested.size.<size>
|
||||
# gauge zuul.nodepool.current_requests
|
||||
pipe = statsd.pipeline()
|
||||
state = request.state
|
||||
dt = None
|
||||
|
||||
if request.canceled:
|
||||
state = 'canceled'
|
||||
dt = None
|
||||
elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
|
||||
dt = int((request.state_time - request.requested_time) * 1000)
|
||||
else:
|
||||
dt = None
|
||||
key = 'zuul.nodepool.%s' % state
|
||||
statsd.incr(key)
|
||||
|
||||
key = 'zuul.nodepool.requests.%s' % state
|
||||
pipe.incr(key + ".total")
|
||||
|
||||
if dt:
|
||||
statsd.timing(key, dt)
|
||||
pipe.timing(key, dt)
|
||||
for node in request.nodeset.getNodes():
|
||||
statsd.incr(key + '.label.%s' % node.label)
|
||||
pipe.incr(key + '.label.%s' % node.label)
|
||||
if dt:
|
||||
statsd.timing(key + '.label.%s' % node.label, dt)
|
||||
statsd.incr(key + '.size.%s' % len(request.nodeset.nodes))
|
||||
pipe.timing(key + '.label.%s' % node.label, dt)
|
||||
pipe.incr(key + '.size.%s' % len(request.nodeset.nodes))
|
||||
if dt:
|
||||
statsd.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
|
||||
statsd.gauge('zuul.nodepool.current_requests', len(self.requests))
|
||||
pipe.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
|
||||
pipe.gauge('zuul.nodepool.current_requests', len(self.requests))
|
||||
pipe.send()
|
||||
|
||||
def requestNodes(self, build_set, job, relative_priority):
|
||||
# Create a copy of the nodeset to represent the actual nodes
|
||||
|
|
Loading…
Reference in New Issue