summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2019-01-07 22:57:07 +0000
committerGerrit Code Review <review@openstack.org>2019-01-07 22:57:07 +0000
commitfd53894f7388b8c26a7c3d2e9f9743e318ba23d3 (patch)
treeb2dc2ab0a25d2af03239e2dbe23729d1ebae2a6f
parent7d1c6be373446303bade18a4c27b3176bfb30a37 (diff)
parentd4f75ffac8facbb14d0fe55d754db6f18b138f0a (diff)
Merge "Add timer for starting_builds"
-rw-r--r--doc/source/admin/monitoring.rst20
-rw-r--r--releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml6
-rw-r--r--tests/unit/test_scheduler.py2
-rw-r--r--zuul/executor/server.py7
4 files changed, 31 insertions, 4 deletions
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index bd47f7b..6680b0f 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
145 Incremented each time the executor starts a build. 145 Incremented each time the executor starts a build.
146 146
147 .. stat:: starting_builds 147 .. stat:: starting_builds
148 :type: gauge 148 :type: gauge, timer
149 149
150 The number of builds starting on this executor. These are 150 The number of builds starting on this executor and a timer containing
151 builds which have not yet begun their first pre-playbook. 151 how long jobs were in this state. These are builds which have not yet
152 begun their first pre-playbook.
153
154 The timer needs special thoughts when interpreting it because it
155 aggregates all jobs. It can be useful when aggregating it over a longer
156 period of time (maybe a day) where fast rising graphs could indicate e.g.
157 IO problems of the machines the executors are running on. But it has to
158 be noted that a rising graph also can indicate a higher usage of complex
159 jobs using more required projects. Also comparing several executors might
160 give insight if the graphs differ a lot from each other. Typically the
161 jobs are equally distributed over all executors (in the same zone when
162 using the zone feature) and as such the starting jobs timers (aggregated
163 over a large enough interval) should not differ much.
152 164
153 .. stat:: running_builds 165 .. stat:: running_builds
154 :type: gauge 166 :type: gauge
diff --git a/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
new file mode 100644
index 0000000..05bdddd
--- /dev/null
+++ b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 The executors emit a new timer
5 :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
6 during starting.
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 2c0b93a..0848103 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
161 exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace( 161 exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
162 '.', '_') 162 '.', '_')
163 self.assertReportedStat(exec_key + '.builds', value='1', kind='c') 163 self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
164 self.assertReportedStat(exec_key + '.starting_builds', kind='g')
165 self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
164 self.assertReportedStat( 166 self.assertReportedStat(
165 'zuul.nodepool.requests.requested.total', value='1', kind='c') 167 'zuul.nodepool.requests.requested.total', value='1', kind='c')
166 self.assertReportedStat( 168 self.assertReportedStat(
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index 7956628..0917f54 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -648,6 +648,7 @@ class AnsibleJob(object):
648 self.proc_lock = threading.Lock() 648 self.proc_lock = threading.Lock()
649 self.running = False 649 self.running = False
650 self.started = False # Whether playbooks have started running 650 self.started = False # Whether playbooks have started running
651 self.time_starting_build = None
651 self.paused = False 652 self.paused = False
652 self.aborted = False 653 self.aborted = False
653 self.aborted_reason = None 654 self.aborted_reason = None
@@ -736,6 +737,7 @@ class AnsibleJob(object):
736 737
737 def execute(self): 738 def execute(self):
738 try: 739 try:
740 self.time_starting_build = time.monotonic()
739 self.ssh_agent.start() 741 self.ssh_agent.start()
740 self.ssh_agent.add(self.private_key_file) 742 self.ssh_agent.add(self.private_key_file)
741 for key in self.arguments.get('ssh_keys', []): 743 for key in self.arguments.get('ssh_keys', []):
@@ -1093,6 +1095,11 @@ class AnsibleJob(object):
1093 1095
1094 pre_failed = False 1096 pre_failed = False
1095 success = False 1097 success = False
1098 if self.executor_server.statsd:
1099 key = "zuul.executor.{hostname}.starting_builds"
1100 self.executor_server.statsd.timing(
1101 key, (time.monotonic() - self.time_starting_build) * 1000)
1102
1096 self.started = True 1103 self.started = True
1097 time_started = time.time() 1104 time_started = time.time()
1098 # timeout value is "total" job timeout which accounts for 1105 # timeout value is "total" job timeout which accounts for