Merge "Fix rare semaphore leak during reconfiguration"

This commit is contained in:
Zuul 2019-02-28 17:59:46 +00:00 committed by Gerrit Code Review
commit b12d0d6164
4 changed files with 207 additions and 3 deletions

View File

@ -0,0 +1,110 @@
- pipeline:
name: check
manager: independent
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
- semaphore:
name: test-semaphore-two
max: 2
- job:
name: base
parent: null
nodeset:
nodes:
- name: controller
label: label1
- job:
name: project-test1
run: playbooks/project-test1.yaml
- job:
name: semaphore-one-test1
semaphore: test-semaphore
run: playbooks/semaphore-one-test1.yaml
- job:
name: semaphore-one-test2
semaphore: test-semaphore
run: playbooks/semaphore-one-test2.yaml
- job:
name: semaphore-two-test1
semaphore: test-semaphore-two
run: playbooks/semaphore-two-test1.yaml
- job:
name: semaphore-two-test2
semaphore: test-semaphore-two
run: playbooks/semaphore-two-test2.yaml
- job:
name: semaphore-one-test3
semaphore: test-semaphore
run: playbooks/semaphore-two-test1.yaml
nodeset:
nodes:
- name: controller
label: label1
- job:
name: semaphore-one-test1-resources-first
semaphore:
name: test-semaphore
resources-first: True
run: playbooks/semaphore-one-test1.yaml
- job:
name: semaphore-one-test2-resources-first
semaphore:
name: test-semaphore
resources-first: True
run: playbooks/semaphore-one-test1.yaml
- project:
name: org/project
check:
jobs:
- project-test1
# This is the difference to zuul.yaml that is used in
# test_semaphore_reconfigure_job_removal:
# - semaphore-one-test1
# - semaphore-one-test2
- project:
name: org/project1
check:
jobs:
- project-test1
- semaphore-two-test1
- semaphore-two-test2
- project:
name: org/project2
check:
jobs:
- semaphore-one-test3
- project:
name: org/project3
check:
jobs:
- project-test1
- semaphore-one-test1-resources-first
- semaphore-one-test2-resources-first
- project:
name: org/project4
check:
jobs:
- semaphore-one-test1-resources-first

View File

@ -6584,6 +6584,96 @@ class TestSemaphore(ZuulTestCase):
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
def test_semaphore_reconfigure_job_removal(self):
"Test job removal during reconfiguration with semaphores"
self.executor_server.hold_jobs_in_build = True
tenant = self.sched.abide.tenants.get('tenant-one')
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.assertTrue('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.commitConfigUpdate(
'common-config',
'config/semaphore/git/common-config/zuul-remove-job.yaml')
self.sched.reconfigure(self.config)
self.waitUntilSettled()
# Release job project-test1 which should be the only job left
self.executor_server.release('project-test1')
self.waitUntilSettled()
# The check pipeline should be empty
tenant = self.sched.abide.tenants.get('tenant-one')
check_pipeline = tenant.layout.pipelines['check']
items = check_pipeline.getAllItems()
self.assertEqual(len(items), 0)
# The semaphore should be released
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
def test_semaphore_reconfigure_job_removal_pending_node_request(self):
"""
Test job removal during reconfiguration with semaphores and pending
node request.
"""
self.executor_server.hold_jobs_in_build = True
# Pause nodepool so we can block the job in node request state during
# reconfiguration.
self.fake_nodepool.pause()
tenant = self.sched.abide.tenants.get('tenant-one')
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.assertTrue('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.commitConfigUpdate(
'common-config',
'config/semaphore/git/common-config/zuul-remove-job.yaml')
self.sched.reconfigure(self.config)
self.waitUntilSettled()
# Now we can unpause nodepool
self.fake_nodepool.unpause()
self.waitUntilSettled()
# Release job project-test1 which should be the only job left
self.executor_server.release('project-test1')
self.waitUntilSettled()
# The check pipeline should be empty
tenant = self.sched.abide.tenants.get('tenant-one')
check_pipeline = tenant.layout.pipelines['check']
items = check_pipeline.getAllItems()
self.assertEqual(len(items), 0)
# The semaphore should be released
self.assertFalse('test-semaphore' in
tenant.semaphore_handler.semaphores)
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
class TestSemaphoreMultiTenant(ZuulTestCase):
tenant_config_file = 'config/multi-tenant-semaphore/main.yaml'

View File

@ -836,7 +836,7 @@ class PipelineManager(object):
self.log.debug("Item %s status is now:\n %s" %
(item, item.formatStatus()))
if build.retry:
if build.retry and build.build_set.getJobNodeSet(build.job.name):
build.build_set.removeJobNodeSet(build.job.name)
self._resumeBuilds(build.build_set)

View File

@ -830,6 +830,8 @@ class Scheduler(threading.Thread):
item.current_build_set.node_requests.items():
requests_to_cancel.append(
(item.current_build_set, request))
semaphores_to_release = []
for build in builds_to_cancel:
self.log.info(
"Canceling build %s during reconfiguration" % (build,))
@ -848,14 +850,16 @@ class Scheduler(threading.Thread):
self.log.exception(
"Exception while removing nodeset from build %s "
"for change %s" % (build, build.build_set.item.change))
tenant.semaphore_handler.release(
build.build_set.item, build.job)
semaphores_to_release.append((build.build_set.item, build.job))
for build_set, request in requests_to_cancel:
self.log.info(
"Canceling node request %s during reconfiguration",
request)
self.nodepool.cancelRequest(request)
build_set.removeJobNodeRequest(request.job.name)
semaphores_to_release.append((build_set.item, request.job))
for item, job in semaphores_to_release:
tenant.semaphore_handler.release(item, job)
def _reconfigureTenant(self, tenant):
# This is called from _doReconfigureEvent while holding the