Merge "Resume paused job with skipped children"

This commit is contained in:
Zuul 2018-11-06 16:30:03 +00:00 committed by Gerrit Code Review
commit 4d91ea3bef
9 changed files with 145 additions and 32 deletions

View File

@ -1502,6 +1502,9 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
build = self.executor_server.job_builds[self.job.unique]
if self.executor_server._run_ansible:
# Call run on the fake build omitting the result so we also can
# hold real ansible jobs.
build.run()
result = super(RecordingAnsibleJob, self).runAnsible(
cmd, timeout, playbook, wrapped)
else:

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,7 @@
- hosts: all
tasks:
- name: Pause and let child run
zuul_return:
data:
zuul:
pause: true

View File

@ -0,0 +1,4 @@
- hosts: all
tasks:
- fail:
msg: pre-test fails

View File

@ -0,0 +1,4 @@
- hosts: all
tasks:
- debug:
msg: Dummy

View File

@ -0,0 +1,22 @@
- job:
name: compile
run: playbooks/compile.yaml
- job:
name: test
run: playbooks/test.yaml
- job:
name: pre-test
run: playbooks/pre-test.yaml
- project:
check:
jobs:
- compile
- pre-test
- test:
dependencies:
- compile
- pre-test

View File

@ -6,3 +6,4 @@
- common-config
- org/project
- org/project2
- org/project3

View File

@ -4272,3 +4272,82 @@ class TestJobPause(AnsibleZuulTestCase):
self.assertHistory([
dict(name='just-pause', result='SUCCESS', changes='1,1'),
], ordered=False)
def test_job_pause_skipped_child(self):
"""
Tests that a paused job is resumed with externally skipped jobs.
Tests that this situation won't lead to stuck buildsets.
Compile pauses before pre-test fails.
1. compile (pauses) --+
|
+--> test (skipped because of pre-test)
|
2. pre-test (fails) --+
"""
self.wait_timeout = 120
self.executor_server.hold_jobs_in_build = True
# Output extra ansible info so we might see errors.
self.executor_server.verbose = True
self.executor_server.keep_jobdir = True
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.executor_server.release('compile')
self.waitUntilSettled()
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
self.assertHistory([
dict(name='pre-test', result='FAILURE', changes='1,1'),
dict(name='compile', result='SUCCESS', changes='1,1'),
])
self.assertIn('test : SKIPPED', A.messages[0])
def test_job_pause_pre_skipped_child(self):
"""
Tests that a paused job is resumed with pre-existing skipped jobs.
Tests that this situation won't lead to stuck buildsets.
The pre-test fails before compile pauses so test is already skipped
when compile pauses.
1. pre-test (fails) --+
|
+--> test (skipped because of pre-test)
|
2. compile (pauses) --+
"""
self.wait_timeout = 120
self.executor_server.hold_jobs_in_build = True
# Output extra ansible info so we might see errors.
self.executor_server.verbose = True
self.executor_server.keep_jobdir = True
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.executor_server.release('pre-test')
self.waitUntilSettled()
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
self.assertHistory([
dict(name='pre-test', result='FAILURE', changes='1,1'),
dict(name='compile', result='SUCCESS', changes='1,1'),
])
self.assertIn('test : SKIPPED', A.messages[0])

View File

@ -721,42 +721,34 @@ class PipelineManager(object):
def onBuildPaused(self, build):
item = build.build_set.item
self.log.debug("Build %s of %s paused", build, item.change)
child_jobs = item.job_graph.getDependentJobsRecursively(build.job.name)
if not child_jobs:
# if there are no child jobs we need to directly resume the build
self.log.debug("Resuming build %s of %s with no child jobs",
build, item.change)
self.sched.executor.resumeBuild(build)
item.setResult(build)
# We need to resume builds because we could either have no children
# or have children that are already skipped.
self._resumeBuilds(build.build_set)
return True
def _resumeParents(self, build):
def _resumeBuilds(self, build_set):
"""
Resumes all parent jobs where all children are finished.
Resumes all paused builds of a buildset that may be resumed.
"""
item = build.build_set.item
builds = item.current_build_set.builds
jobgraph = item.job_graph
parent_builds = [builds.get(x.name) for x in
jobgraph.getParentJobsRecursively(
build.job.name, soft=True)]
for parent_build in parent_builds:
if parent_build and parent_build.paused:
# check if all child jobs are finished
child_builds = [builds.get(x.name) for x in
jobgraph.getDependentJobsRecursively(
parent_build.job.name)]
all_completed = True
for child_build in child_builds:
if not child_build or not child_build.result:
all_completed = False
break
jobgraph = build_set.item.job_graph
for build in build_set.builds.values():
if not build.paused:
continue
# check if all child jobs are finished
child_builds = [build_set.builds.get(x.name) for x in
jobgraph.getDependentJobsRecursively(
build.job.name)]
all_completed = True
for child_build in child_builds:
if not child_build or not child_build.result:
all_completed = False
break
if all_completed:
self.sched.executor.resumeBuild(parent_build)
parent_build.paused = False
if all_completed:
self.sched.executor.resumeBuild(build)
build.paused = False
def onBuildCompleted(self, build):
item = build.build_set.item
@ -771,7 +763,7 @@ class PipelineManager(object):
if build.retry:
build.build_set.removeJobNodeSet(build.job.name)
self._resumeParents(build)
self._resumeBuilds(build.build_set)
return True
def onMergeCompleted(self, event):
@ -799,7 +791,7 @@ class PipelineManager(object):
self.log.info("Node request %s: failure for %s" %
(request, request.job.name,))
build_set.item.setNodeRequestFailure(request.job)
self._resumeParents(request)
self._resumeBuilds(request.build_set)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,