Merge "Fix missing semaphore release on node failure"

This commit is contained in:
Zuul 2019-02-05 19:41:22 +00:00 committed by Gerrit Code Review
commit 069a550f81
5 changed files with 75 additions and 0 deletions

View File

@ -103,3 +103,9 @@
- project-test1
- semaphore-one-test1-resources-first
- semaphore-one-test2-resources-first
- project:
name: org/project4
check:
jobs:
- semaphore-one-test1-resources-first

View File

@ -0,0 +1 @@
test

View File

@ -9,3 +9,4 @@
- org/project1
- org/project2
- org/project3
- org/project4

View File

@ -6251,6 +6251,38 @@ class TestSemaphore(ZuulTestCase):
self.assertEqual(A.reported, 1)
self.assertEqual(B.reported, 1)
def test_semaphore_node_failure(self):
"Test semaphore and node failure"
tenant = self.sched.abide.tenants.get('tenant-one')
# Pause nodepool so we can fail the node request later
self.fake_nodepool.pause()
A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# By default we first lock the semaphore and then get the nodes
# so at this point the semaphore needs to be aquired.
self.assertTrue('test-semaphore' in
tenant.semaphore_handler.semaphores)
# Fail the node request and unpause
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.unpause()
self.waitUntilSettled()
# At this point the job that holds the semaphore failed with
# node_failure and the semaphore must be released.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.assertEquals(1, A.reported)
self.assertIn('semaphore-one-test3 semaphore-one-test3 : NODE_FAILURE',
A.messages[0])
def test_semaphore_resources_first(self):
"Test semaphores with max=1 (mutex) and get resources first"
tenant = self.sched.abide.tenants.get('tenant-one')
@ -6298,6 +6330,38 @@ class TestSemaphore(ZuulTestCase):
self.executor_server.release()
self.waitUntilSettled()
def test_semaphore_resources_first_node_failure(self):
"Test semaphore and node failure"
tenant = self.sched.abide.tenants.get('tenant-one')
# Pause nodepool so we can fail the node request later
self.fake_nodepool.pause()
A = self.fake_gerrit.addFakeChange('org/project4', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# With resources first we first get the nodes so at this point the
# semaphore must not be aquired.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
# Fail the node request and unpause
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.unpause()
self.waitUntilSettled()
# At this point the job should never have acuired a semaphore so check
# that it still has not locked a semaphore.
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.assertEquals(1, A.reported)
self.assertIn('semaphore-one-test1-resources-first : NODE_FAILURE',
A.messages[0])
def test_semaphore_zk_error(self):
"Test semaphore release with zk error"
tenant = self.sched.abide.tenants.get('tenant-one')

View File

@ -864,6 +864,9 @@ class PipelineManager(object):
(request, request.job.name,))
build_set.item.setNodeRequestFailure(request.job)
self._resumeBuilds(request.build_set)
tenant = build_set.item.pipeline.tenant
tenant.semaphore_handler.release(build_set.item, request.job)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,