Fix missing semaphore release on zk error
During problems with zk connectivity jobs can fail locking nodes [1]. In this case the build doesn't get created and attached to the queue item. However semaphores are already aquired at this point and don't get released in this case. Fix this by releasing the semaphore when hitting this exception. [1] Trace: 2018-04-05 10:56:56,936 ERROR zuul.Pipeline.example.check: Exception while executing job example-test for change <Change 0x7f65e9dd59e8 14,55692b4a936fff57e33036399927332849a53a92>: Traceback (most recent call last): File "/usr/lib/python3.6/site-packages/zuul/manager/__init__.py", line 396, in _executeJobs self.sched.nodepool.useNodeSet(nodeset) File "/usr/lib/python3.6/site-packages/zuul/nodepool.py", line 117, in useNodeSet self.sched.zk.storeNode(node) File "/usr/lib/python3.6/site-packages/zuul/zk.py", line 213, in storeNode self.client.set(path, self._dictToStr(node.toDict())) File "/usr/lib/python3.6/site-packages/kazoo/client.py", line 1242, in set return self.set_async(path, value, version).get() File "/usr/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 79, in get raise self._exception kazoo.exceptions.NoNodeError Change-Id: I851876ece318aa047e523c50f4c721417d1af6b7
This commit is contained in:
parent
f523b1679e
commit
c5e6f5cefe
|
@ -47,6 +47,15 @@
|
|||
semaphore: test-semaphore-two
|
||||
run: playbooks/semaphore-two-test2.yaml
|
||||
|
||||
- job:
|
||||
name: semaphore-one-test3
|
||||
semaphore: test-semaphore
|
||||
run: playbooks/semaphore-two-test1.yaml
|
||||
nodeset:
|
||||
nodes:
|
||||
- name: controller
|
||||
label: label1
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
check:
|
||||
|
@ -62,3 +71,9 @@
|
|||
- project-test1
|
||||
- semaphore-two-test1
|
||||
- semaphore-two-test2
|
||||
|
||||
- project:
|
||||
name: org/project2
|
||||
check:
|
||||
jobs:
|
||||
- semaphore-one-test3
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
test
|
|
@ -7,3 +7,4 @@
|
|||
untrusted-projects:
|
||||
- org/project
|
||||
- org/project1
|
||||
- org/project2
|
||||
|
|
|
@ -22,6 +22,7 @@ import os
|
|||
import shutil
|
||||
import time
|
||||
from unittest import skip
|
||||
from kazoo.exceptions import NoNodeError
|
||||
|
||||
import git
|
||||
import testtools
|
||||
|
@ -5442,6 +5443,35 @@ class TestSemaphore(ZuulTestCase):
|
|||
self.assertEqual(A.reported, 1)
|
||||
self.assertEqual(B.reported, 1)
|
||||
|
||||
def test_semaphore_zk_error(self):
|
||||
"Test semaphore release with zk error"
|
||||
tenant = self.sched.abide.tenants.get('tenant-one')
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A')
|
||||
self.assertFalse('test-semaphore' in
|
||||
tenant.semaphore_handler.semaphores)
|
||||
|
||||
# Simulate a single zk error in useNodeSet
|
||||
orig_useNodeSet = self.nodepool.useNodeSet
|
||||
|
||||
def broken_use_nodeset(nodeset):
|
||||
# restore original useNodeSet
|
||||
self.nodepool.useNodeSet = orig_useNodeSet
|
||||
raise NoNodeError()
|
||||
|
||||
self.nodepool.useNodeSet = broken_use_nodeset
|
||||
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
# The semaphore should be released
|
||||
self.assertFalse('test-semaphore' in
|
||||
tenant.semaphore_handler.semaphores)
|
||||
|
||||
# cleanup the queue
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
def test_semaphore_abandon(self):
|
||||
"Test abandon with job semaphores"
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
|
|
@ -399,6 +399,14 @@ class PipelineManager(object):
|
|||
except Exception:
|
||||
self.log.exception("Exception while executing job %s "
|
||||
"for change %s:" % (job, item.change))
|
||||
try:
|
||||
# If we hit an exception we don't have a build in the
|
||||
# current item so a potentially aquired semaphore must be
|
||||
# released as it won't be released on dequeue of the item.
|
||||
tenant = item.pipeline.layout.tenant
|
||||
tenant.semaphore_handler.release(item, job)
|
||||
except Exception:
|
||||
self.log.exception("Exception while releasing semaphore")
|
||||
|
||||
def executeJobs(self, item):
|
||||
# TODO(jeblair): This should return a value indicating a job
|
||||
|
|
Loading…
Reference in New Issue