summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames E. Blair <jeblair@redhat.com>2019-01-03 12:50:35 -0800
committerJames E. Blair <jeblair@redhat.com>2019-01-03 12:50:35 -0800
commit31a7ddc7916a3e9c88e73fd52124fa6c1720ca26 (patch)
tree2bbac5860c6423f3b7812f494dcb3b5724f30b48
parent3ddce68728eeb0a00e9647feab2f53a14918eceb (diff)
Fix items stuck in queue pending node requests
Given the following sequence: * An item is within the active window * A node request is submitted for a job for that item * The active window shrinks * A tenant reconfiguration is performed * The node request completes The nodes would be returned, however the record of the node request was not removed from the item's buildset. Once the item entered the active window again, no further node requests would happen for that job because of the retained node request record, and therefore the item would be stuck in the queue. To correct this, discard the node request record on the buildset if we are going to return the nodes unused in that case. Also clarify a debug message "job set" -> "job nodeset", and correct some typos from a related unit test which were inadvertently simply asserting true. Change-Id: Ia6259b08ecda49f3123dae1c7d0e35dc46004561
Notes
Notes (review): Code-Review+2: Monty Taylor <mordred@inaugust.com> Code-Review+2: Clark Boylan <cboylan@sapwetik.org> Workflow+1: Clark Boylan <cboylan@sapwetik.org> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Thu, 03 Jan 2019 23:00:24 +0000 Reviewed-on: https://review.openstack.org/628300 Project: openstack-infra/zuul Branch: refs/heads/master
-rw-r--r--tests/unit/test_scheduler.py75
-rw-r--r--zuul/model.py7
-rw-r--r--zuul/scheduler.py6
3 files changed, 84 insertions, 4 deletions
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 06d7b8d..2c0b93a 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -4484,7 +4484,7 @@ class TestScheduler(ZuulTestCase):
4484 tenant = self.sched.abide.tenants.get('tenant-one') 4484 tenant = self.sched.abide.tenants.get('tenant-one')
4485 queue = tenant.layout.pipelines['gate'].queues[0] 4485 queue = tenant.layout.pipelines['gate'].queues[0]
4486 self.assertEqual(queue.window, 2) 4486 self.assertEqual(queue.window, 2)
4487 self.assertTrue(len(self.builds), 4) 4487 self.assertEqual(len(self.builds), 4)
4488 4488
4489 self.waitUntilSettled() 4489 self.waitUntilSettled()
4490 self.commitConfigUpdate('org/common-config', 4490 self.commitConfigUpdate('org/common-config',
@@ -4499,7 +4499,7 @@ class TestScheduler(ZuulTestCase):
4499 # B is outside the window, but still marked active until the 4499 # B is outside the window, but still marked active until the
4500 # next pass through the queue processor, so its builds haven't 4500 # next pass through the queue processor, so its builds haven't
4501 # been canceled. 4501 # been canceled.
4502 self.assertTrue(len(self.builds), 4) 4502 self.assertEqual(len(self.builds), 4)
4503 4503
4504 self.sched.reconfigure(self.config) 4504 self.sched.reconfigure(self.config)
4505 tenant = self.sched.abide.tenants.get('tenant-one') 4505 tenant = self.sched.abide.tenants.get('tenant-one')
@@ -4507,7 +4507,7 @@ class TestScheduler(ZuulTestCase):
4507 self.assertEqual(queue.window, 1) 4507 self.assertEqual(queue.window, 1)
4508 self.waitUntilSettled() 4508 self.waitUntilSettled()
4509 # B's builds have been canceled now 4509 # B's builds have been canceled now
4510 self.assertTrue(len(self.builds), 2) 4510 self.assertEqual(len(self.builds), 2)
4511 4511
4512 self.executor_server.hold_jobs_in_build = False 4512 self.executor_server.hold_jobs_in_build = False
4513 self.executor_server.release() 4513 self.executor_server.release()
@@ -4524,6 +4524,75 @@ class TestScheduler(ZuulTestCase):
4524 dict(name='job2', result='SUCCESS', changes='1,1 2,1'), 4524 dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4525 ], ordered=False) 4525 ], ordered=False)
4526 4526
4527 @simple_layout('layouts/reconfigure-window-fixed.yaml')
4528 def test_reconfigure_window_fixed_requests(self):
4529 # Test the active window shrinking during reconfiguration with
4530 # outstanding node requests
4531 self.executor_server.hold_jobs_in_build = True
4532
4533 # Start the jobs for A
4534 A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
4535 A.addApproval('Code-Review', 2)
4536 self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
4537 self.waitUntilSettled()
4538 self.log.debug("A complete")
4539
4540 # Hold node requests for B
4541 self.fake_nodepool.pause()
4542 B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
4543 B.addApproval('Code-Review', 2)
4544 self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
4545 self.waitUntilSettled()
4546 self.log.debug("B complete")
4547
4548 tenant = self.sched.abide.tenants.get('tenant-one')
4549 queue = tenant.layout.pipelines['gate'].queues[0]
4550 self.assertEqual(queue.window, 2)
4551 self.assertEqual(len(self.builds), 2)
4552
4553 self.waitUntilSettled()
4554 self.commitConfigUpdate('org/common-config',
4555 'layouts/reconfigure-window-fixed2.yaml')
4556 self.sched.reconfigure(self.config)
4557 self.waitUntilSettled()
4558 self.log.debug("Reconfiguration complete")
4559
4560 tenant = self.sched.abide.tenants.get('tenant-one')
4561 queue = tenant.layout.pipelines['gate'].queues[0]
4562 # Because we have configured a static window, it should
4563 # be allowed to shrink on reconfiguration.
4564 self.assertEqual(queue.window, 1)
4565 self.assertEqual(len(self.builds), 2)
4566
4567 # After the previous reconfig, the queue processor will have
4568 # run and marked B inactive; run another reconfiguration so
4569 # that we're testing what happens when we reconfigure after
4570 # the active window having shrunk.
4571 self.sched.reconfigure(self.config)
4572
4573 # Unpause the node requests now
4574 self.fake_nodepool.unpause()
4575 self.waitUntilSettled()
4576 self.log.debug("Nodepool unpause complete")
4577
4578 # Allow A to merge and B to enter the active window and complete
4579 self.executor_server.hold_jobs_in_build = False
4580 self.executor_server.release()
4581 self.waitUntilSettled()
4582 self.log.debug("Executor unpause complete")
4583
4584 tenant = self.sched.abide.tenants.get('tenant-one')
4585 queue = tenant.layout.pipelines['gate'].queues[0]
4586 self.assertEqual(queue.window, 1)
4587
4588 self.waitUntilSettled()
4589 self.assertHistory([
4590 dict(name='job1', result='SUCCESS', changes='1,1'),
4591 dict(name='job2', result='SUCCESS', changes='1,1'),
4592 dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
4593 dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4594 ], ordered=False)
4595
4527 @simple_layout('layouts/reconfigure-remove-add.yaml') 4596 @simple_layout('layouts/reconfigure-remove-add.yaml')
4528 def test_reconfigure_remove_add(self): 4597 def test_reconfigure_remove_add(self):
4529 # Test removing, then adding a job while in queue 4598 # Test removing, then adding a job while in queue
diff --git a/zuul/model.py b/zuul/model.py
index 3b972b9..09726ac 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -1849,7 +1849,7 @@ class BuildSet(object):
1849 1849
1850 def removeJobNodeSet(self, job_name): 1850 def removeJobNodeSet(self, job_name):
1851 if job_name not in self.nodesets: 1851 if job_name not in self.nodesets:
1852 raise Exception("No job set for %s" % (job_name)) 1852 raise Exception("No job nodeset for %s" % (job_name))
1853 del self.nodesets[job_name] 1853 del self.nodesets[job_name]
1854 1854
1855 def setJobNodeRequest(self, job_name, req): 1855 def setJobNodeRequest(self, job_name, req):
@@ -1860,6 +1860,11 @@ class BuildSet(object):
1860 def getJobNodeRequest(self, job_name): 1860 def getJobNodeRequest(self, job_name):
1861 return self.node_requests.get(job_name) 1861 return self.node_requests.get(job_name)
1862 1862
1863 def removeJobNodeRequest(self, job_name):
1864 if job_name not in self.node_requests:
1865 raise Exception("No node request for %s" % (job_name))
1866 del self.node_requests[job_name]
1867
1863 def jobNodeRequestComplete(self, job_name, req, nodeset): 1868 def jobNodeRequestComplete(self, job_name, req, nodeset):
1864 if job_name in self.nodesets: 1869 if job_name in self.nodesets:
1865 raise Exception("Prior node request for %s" % (job_name)) 1870 raise Exception("Prior node request for %s" % (job_name))
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 0be456e..0303429 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -1313,6 +1313,12 @@ class Scheduler(threading.Thread):
1313 self.log.warning("Item %s does not contain job %s " 1313 self.log.warning("Item %s does not contain job %s "
1314 "for node request %s", 1314 "for node request %s",
1315 build_set.item, request.job.name, request) 1315 build_set.item, request.job.name, request)
1316 try:
1317 build_set.removeJobNodeRequest(request.job.name)
1318 except Exception:
1319 self.log.exception("Unable to remove obsolete node request "
1320 "%s for %s job %s",
1321 request, build_set.item, request.job.name)
1316 if request.fulfilled: 1322 if request.fulfilled:
1317 self.nodepool.returnNodeSet(request.nodeset) 1323 self.nodepool.returnNodeSet(request.nodeset)
1318 return 1324 return