Fix missing semaphore release on zk error

During problems with zk connectivity jobs can fail locking nodes
[1]. In this case the build doesn't get created and attached to the
queue item. However semaphores are already aquired at this point and
don't get released in this case. Fix this by releasing the semaphore
when hitting this exception.

[1] Trace:
2018-04-05 10:56:56,936 ERROR zuul.Pipeline.example.check: Exception while executing job example-test for change <Change 0x7f65e9dd59e8 14,55692b4a936fff57e33036399927332849a53a92>:
Traceback (most recent call last):
  File "/usr/lib/python3.6/site-packages/zuul/manager/__init__.py", line 396, in _executeJobs
    self.sched.nodepool.useNodeSet(nodeset)
  File "/usr/lib/python3.6/site-packages/zuul/nodepool.py", line 117, in useNodeSet
    self.sched.zk.storeNode(node)
  File "/usr/lib/python3.6/site-packages/zuul/zk.py", line 213, in storeNode
    self.client.set(path, self._dictToStr(node.toDict()))
  File "/usr/lib/python3.6/site-packages/kazoo/client.py", line 1242, in set
    return self.set_async(path, value, version).get()
  File "/usr/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 79, in get
    raise self._exception
kazoo.exceptions.NoNodeError

Change-Id: I851876ece318aa047e523c50f4c721417d1af6b7
This commit is contained in:
Tobias Henkel 2018-04-09 16:02:45 +02:00
parent f523b1679e
commit c5e6f5cefe
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
5 changed files with 55 additions and 0 deletions

View File

@ -47,6 +47,15 @@
semaphore: test-semaphore-two
run: playbooks/semaphore-two-test2.yaml
- job:
name: semaphore-one-test3
semaphore: test-semaphore
run: playbooks/semaphore-two-test1.yaml
nodeset:
nodes:
- name: controller
label: label1
- project:
name: org/project
check:
@ -62,3 +71,9 @@
- project-test1
- semaphore-two-test1
- semaphore-two-test2
- project:
name: org/project2
check:
jobs:
- semaphore-one-test3

View File

@ -0,0 +1 @@
test

View File

@ -7,3 +7,4 @@
untrusted-projects:
- org/project
- org/project1
- org/project2

View File

@ -22,6 +22,7 @@ import os
import shutil
import time
from unittest import skip
from kazoo.exceptions import NoNodeError
import git
import testtools
@ -5442,6 +5443,35 @@ class TestSemaphore(ZuulTestCase):
self.assertEqual(A.reported, 1)
self.assertEqual(B.reported, 1)
def test_semaphore_zk_error(self):
"Test semaphore release with zk error"
tenant = self.sched.abide.tenants.get('tenant-one')
A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
# Simulate a single zk error in useNodeSet
orig_useNodeSet = self.nodepool.useNodeSet
def broken_use_nodeset(nodeset):
# restore original useNodeSet
self.nodepool.useNodeSet = orig_useNodeSet
raise NoNodeError()
self.nodepool.useNodeSet = broken_use_nodeset
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# The semaphore should be released
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
# cleanup the queue
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
def test_semaphore_abandon(self):
"Test abandon with job semaphores"
self.executor_server.hold_jobs_in_build = True

View File

@ -399,6 +399,14 @@ class PipelineManager(object):
except Exception:
self.log.exception("Exception while executing job %s "
"for change %s:" % (job, item.change))
try:
# If we hit an exception we don't have a build in the
# current item so a potentially aquired semaphore must be
# released as it won't be released on dequeue of the item.
tenant = item.pipeline.layout.tenant
tenant.semaphore_handler.release(item, job)
except Exception:
self.log.exception("Exception while releasing semaphore")
def executeJobs(self, item):
# TODO(jeblair): This should return a value indicating a job