summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: James E. Blair <jeblair@redhat.com> 2019-01-03 13:06:39 -0800
committer: James E. Blair <jeblair@redhat.com> 2019-01-03 14:07:30 -0800
commit: bb9ad84cb60c3b6132e4d74f5eb0116637215ef8 (patch)
tree: ad7bf416d63017602caca4a57a6af159aec1aa1e
parent: 31a7ddc7916a3e9c88e73fd52124fa6c1720ca26 (diff)
Be more aggressive in canceling node requests
During a reconfiguration, we may cancel builds if they are no longer necessary; however, we do not do the same for node requests. Currently we let them run to completion and then return them unused. It would be more efficient (and behaviorally consistent) to cancel the request as soon as we determine that it won't be used.

Also, change some warning log messages to info (as they don't indicate a situation which may benefit from corrective action).

Change-Id: Ic3ef6b75437bf82bf9c8f426b23ea14d9aaa96b7
Notes
Notes (review):
Code-Review+2: Monty Taylor <mordred@inaugust.com>
Code-Review+2: Clark Boylan <cboylan@sapwetik.org>
Workflow+1: Clark Boylan <cboylan@sapwetik.org>
Code-Review+2: Jeremy Stanley <fungi@yuggoth.org>
Verified+2: Zuul
Submitted-by: Zuul
Submitted-at: Thu, 03 Jan 2019 23:03:26 +0000
Reviewed-on: https://review.openstack.org/628301
Project: openstack-infra/zuul
Branch: refs/heads/master
-rw-r--r-- zuul/model.py 5
-rw-r--r-- zuul/scheduler.py 28
2 files changed, 22 insertions, 11 deletions
diff --git a/zuul/model.py b/zuul/model.py
index 09726ac..9899c92 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -1861,9 +1861,8 @@ class BuildSet(object):
1861 return self.node_requests.get(job_name) 1861 return self.node_requests.get(job_name)
1862 1862
1863 def removeJobNodeRequest(self, job_name): 1863 def removeJobNodeRequest(self, job_name):
1864 if job_name not in self.node_requests: 1864 if job_name in self.node_requests:
1865 raise Exception("No node request for %s" % (job_name)) 1865 del self.node_requests[job_name]
1866 del self.node_requests[job_name]
1867 1866
1868 def jobNodeRequestComplete(self, job_name, req, nodeset): 1867 def jobNodeRequestComplete(self, job_name, req, nodeset):
1869 if job_name in self.nodesets: 1868 if job_name in self.nodesets:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 0303429..0a4bcee 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -770,6 +770,7 @@ class Scheduler(threading.Thread):
770 new_pipeline.window_floor) 770 new_pipeline.window_floor)
771 items_to_remove = [] 771 items_to_remove = []
772 builds_to_cancel = [] 772 builds_to_cancel = []
773 requests_to_cancel = []
773 last_head = None 774 last_head = None
774 for shared_queue in old_pipeline.queues: 775 for shared_queue in old_pipeline.queues:
775 # Attempt to keep window sizes from shrinking where possible 776 # Attempt to keep window sizes from shrinking where possible
@@ -812,15 +813,25 @@ class Scheduler(threading.Thread):
812 else: 813 else:
813 item.removeBuild(build) 814 item.removeBuild(build)
814 builds_to_cancel.append(build) 815 builds_to_cancel.append(build)
816 for request_job, request in \
817 item.current_build_set.node_requests.items():
818 new_job = item.getJob(request_job)
819 if not new_job:
820 requests_to_cancel.append(
821 (item.current_build_set, request))
815 else: 822 else:
816 items_to_remove.append(item) 823 items_to_remove.append(item)
817 for item in items_to_remove: 824 for item in items_to_remove:
818 self.log.warning( 825 self.log.info(
819 "Removing item %s during reconfiguration" % (item,)) 826 "Removing item %s during reconfiguration" % (item,))
820 for build in item.current_build_set.getBuilds(): 827 for build in item.current_build_set.getBuilds():
821 builds_to_cancel.append(build) 828 builds_to_cancel.append(build)
829 for request_job, request in \
830 item.current_build_set.node_requests.items():
831 requests_to_cancel.append(
832 (item.current_build_set, request))
822 for build in builds_to_cancel: 833 for build in builds_to_cancel:
823 self.log.warning( 834 self.log.info(
824 "Canceling build %s during reconfiguration" % (build,)) 835 "Canceling build %s during reconfiguration" % (build,))
825 try: 836 try:
826 self.executor.cancel(build) 837 self.executor.cancel(build)
@@ -839,6 +850,12 @@ class Scheduler(threading.Thread):
839 "for change %s" % (build, build.build_set.item.change)) 850 "for change %s" % (build, build.build_set.item.change))
840 tenant.semaphore_handler.release( 851 tenant.semaphore_handler.release(
841 build.build_set.item, build.job) 852 build.build_set.item, build.job)
853 for build_set, request in requests_to_cancel:
854 self.log.info(
855 "Canceling node request %s during reconfiguration",
856 request)
857 self.nodepool.cancelRequest(request)
858 build_set.removeJobNodeRequest(request.job.name)
842 859
843 def _reconfigureTenant(self, tenant): 860 def _reconfigureTenant(self, tenant):
844 # This is called from _doReconfigureEvent while holding the 861 # This is called from _doReconfigureEvent while holding the
@@ -1313,12 +1330,7 @@ class Scheduler(threading.Thread):
1313 self.log.warning("Item %s does not contain job %s " 1330 self.log.warning("Item %s does not contain job %s "
1314 "for node request %s", 1331 "for node request %s",
1315 build_set.item, request.job.name, request) 1332 build_set.item, request.job.name, request)
1316 try: 1333 build_set.removeJobNodeRequest(request.job.name)
1317 build_set.removeJobNodeRequest(request.job.name)
1318 except Exception:
1319 self.log.exception("Unable to remove obsolete node request "
1320 "%s for %s job %s",
1321 request, build_set.item, request.job.name)
1322 if request.fulfilled: 1334 if request.fulfilled:
1323 self.nodepool.returnNodeSet(request.nodeset) 1335 self.nodepool.returnNodeSet(request.nodeset)
1324 return 1336 return