summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTobias Henkel <tobias.henkel@bmw.de>2018-12-07 13:22:20 +0100
committerTobias Henkel <tobias.henkel@bmw.de>2018-12-20 07:58:40 +0100
commitd4f75ffac8facbb14d0fe55d754db6f18b138f0a (patch)
treeb5afbe883aaca3a419eb30fd1c33f4cc84cce055
parentea3e777469a67fb3c55041df22f46f79a4f91f7c (diff)
Add timer for starting_builds
We currently have a gauge for starting_builds but actually have no knowledge about how long jobs are in the starting state. This adds a metric for this so we can see changes in the job startup time after changes in the system. Change-Id: I261f8bdc8de336967b9c8ecd6eafc68f0bfe6b78
Notes
Notes (review): Code-Review+2: Joshua Hesketh <josh@nitrotech.org> Code-Review+2: Ian Wienand <iwienand@redhat.com> Workflow+1: Ian Wienand <iwienand@redhat.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Mon, 07 Jan 2019 22:57:07 +0000 Reviewed-on: https://review.openstack.org/623468 Project: openstack-infra/zuul Branch: refs/heads/master
-rw-r--r--doc/source/admin/monitoring.rst20
-rw-r--r--releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml6
-rw-r--r--tests/unit/test_scheduler.py2
-rw-r--r--zuul/executor/server.py7
4 files changed, 31 insertions, 4 deletions
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index bd47f7b..6680b0f 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
145 Incremented each time the executor starts a build. 145 Incremented each time the executor starts a build.
146 146
147 .. stat:: starting_builds 147 .. stat:: starting_builds
148 :type: gauge 148 :type: gauge, timer
149 149
150 The number of builds starting on this executor. These are 150 The number of builds starting on this executor and a timer containing
151 builds which have not yet begun their first pre-playbook. 151 how long jobs were in this state. These are builds which have not yet
152 begun their first pre-playbook.
153
154 The timer needs special thoughts when interpreting it because it
155 aggregates all jobs. It can be useful when aggregating it over a longer
156 period of time (maybe a day) where fast rising graphs could indicate e.g.
157 IO problems of the machines the executors are running on. But it has to
158 be noted that a rising graph also can indicate a higher usage of complex
159 jobs using more required projects. Also comparing several executors might
160 give insight if the graphs differ a lot from each other. Typically the
161 jobs are equally distributed over all executors (in the same zone when
162 using the zone feature) and as such the starting jobs timers (aggregated
163 over a large enough interval) should not differ much.
152 164
153 .. stat:: running_builds 165 .. stat:: running_builds
154 :type: gauge 166 :type: gauge
diff --git a/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
new file mode 100644
index 0000000..05bdddd
--- /dev/null
+++ b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 The executors emit a new timer
5 :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
6 during starting.
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 06d7b8d..24cffb2 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
161 exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace( 161 exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
162 '.', '_') 162 '.', '_')
163 self.assertReportedStat(exec_key + '.builds', value='1', kind='c') 163 self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
164 self.assertReportedStat(exec_key + '.starting_builds', kind='g')
165 self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
164 self.assertReportedStat( 166 self.assertReportedStat(
165 'zuul.nodepool.requests.requested.total', value='1', kind='c') 167 'zuul.nodepool.requests.requested.total', value='1', kind='c')
166 self.assertReportedStat( 168 self.assertReportedStat(
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index 7956628..0917f54 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -648,6 +648,7 @@ class AnsibleJob(object):
648 self.proc_lock = threading.Lock() 648 self.proc_lock = threading.Lock()
649 self.running = False 649 self.running = False
650 self.started = False # Whether playbooks have started running 650 self.started = False # Whether playbooks have started running
651 self.time_starting_build = None
651 self.paused = False 652 self.paused = False
652 self.aborted = False 653 self.aborted = False
653 self.aborted_reason = None 654 self.aborted_reason = None
@@ -736,6 +737,7 @@ class AnsibleJob(object):
736 737
737 def execute(self): 738 def execute(self):
738 try: 739 try:
740 self.time_starting_build = time.monotonic()
739 self.ssh_agent.start() 741 self.ssh_agent.start()
740 self.ssh_agent.add(self.private_key_file) 742 self.ssh_agent.add(self.private_key_file)
741 for key in self.arguments.get('ssh_keys', []): 743 for key in self.arguments.get('ssh_keys', []):
@@ -1093,6 +1095,11 @@ class AnsibleJob(object):
1093 1095
1094 pre_failed = False 1096 pre_failed = False
1095 success = False 1097 success = False
1098 if self.executor_server.statsd:
1099 key = "zuul.executor.{hostname}.starting_builds"
1100 self.executor_server.statsd.timing(
1101 key, (time.monotonic() - self.time_starting_build) * 1000)
1102
1096 self.started = True 1103 self.started = True
1097 time_started = time.time() 1104 time_started = time.time()
1098 # timeout value is "total" job timeout which accounts for 1105 # timeout value is "total" job timeout which accounts for