Resume paused job with skipped children

Currently there are two corner cases where we can miss to resume a
paused job. In the problematic scenario we have a job that pauses
(compile), a job that is independent of 'compile' (pre-test) and a job
that depends on both (test).

The first corner case is 'pre-test' fails before 'compile'
pauses. This leads to 'test' already skipped when 'compile'
pauses. This is currently not catched because a paused job only
directly resumes if there are no child jobs which was based under the
wrong assumption that there can be no build result prior to 'compile'.

The second corner case is that 'pre-test' fails after 'compile'
pauses. In this case it resumes its own parent jobs (that don't exist)
but not the parent jobs of the jobs that get marked as skipped.

Both cases can be solved by not resuming parent builds of builds but
just iterate over the buildset and resume any paused job that is
allowed to be resumed. This also makes it possible to remove the
special resume behavior if the paused job has no children as this case
is now handled by the common resume handling anyway.

Change-Id: If1a9e62d1b3d1782eefef8adfb2ef1a7ba75f2a1
This commit is contained in:
Tobias Henkel 2018-11-05 09:04:47 +01:00
parent 9c2b0a9bbe
commit a239ac23a7
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
9 changed files with 145 additions and 32 deletions

View File

@ -1502,6 +1502,9 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
build = self.executor_server.job_builds[self.job.unique]
if self.executor_server._run_ansible:
# Call run on the fake build omitting the result so we also can
# hold real ansible jobs.
build.run()
result = super(RecordingAnsibleJob, self).runAnsible(
cmd, timeout, playbook, wrapped)
else:

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,7 @@
- hosts: all
tasks:
- name: Pause and let child run
zuul_return:
data:
zuul:
pause: true

View File

@ -0,0 +1,4 @@
- hosts: all
tasks:
- fail:
msg: pre-test fails

View File

@ -0,0 +1,4 @@
- hosts: all
tasks:
- debug:
msg: Dummy

View File

@ -0,0 +1,22 @@
- job:
name: compile
run: playbooks/compile.yaml
- job:
name: test
run: playbooks/test.yaml
- job:
name: pre-test
run: playbooks/pre-test.yaml
- project:
check:
jobs:
- compile
- pre-test
- test:
dependencies:
- compile
- pre-test

View File

@ -6,3 +6,4 @@
- common-config
- org/project
- org/project2
- org/project3

View File

@ -4236,3 +4236,82 @@ class TestJobPause(AnsibleZuulTestCase):
self.assertHistory([
dict(name='just-pause', result='SUCCESS', changes='1,1'),
], ordered=False)
def test_job_pause_skipped_child(self):
"""
Tests that a paused job is resumed with externally skipped jobs.
Tests that this situation won't lead to stuck buildsets.
Compile pauses before pre-test fails.
1. compile (pauses) --+
|
+--> test (skipped because of pre-test)
|
2. pre-test (fails) --+
"""
self.wait_timeout = 120
self.executor_server.hold_jobs_in_build = True
# Output extra ansible info so we might see errors.
self.executor_server.verbose = True
self.executor_server.keep_jobdir = True
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.executor_server.release('compile')
self.waitUntilSettled()
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
self.assertHistory([
dict(name='pre-test', result='FAILURE', changes='1,1'),
dict(name='compile', result='SUCCESS', changes='1,1'),
])
self.assertIn('test : SKIPPED', A.messages[0])
def test_job_pause_pre_skipped_child(self):
"""
Tests that a paused job is resumed with pre-existing skipped jobs.
Tests that this situation won't lead to stuck buildsets.
The pre-test fails before compile pauses so test is already skipped
when compile pauses.
1. pre-test (fails) --+
|
+--> test (skipped because of pre-test)
|
2. compile (pauses) --+
"""
self.wait_timeout = 120
self.executor_server.hold_jobs_in_build = True
# Output extra ansible info so we might see errors.
self.executor_server.verbose = True
self.executor_server.keep_jobdir = True
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.executor_server.release('pre-test')
self.waitUntilSettled()
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
self.assertHistory([
dict(name='pre-test', result='FAILURE', changes='1,1'),
dict(name='compile', result='SUCCESS', changes='1,1'),
])
self.assertIn('test : SKIPPED', A.messages[0])

View File

@ -721,42 +721,34 @@ class PipelineManager(object):
def onBuildPaused(self, build):
item = build.build_set.item
self.log.debug("Build %s of %s paused", build, item.change)
child_jobs = item.job_graph.getDependentJobsRecursively(build.job.name)
if not child_jobs:
# if there are no child jobs we need to directly resume the build
self.log.debug("Resuming build %s of %s with no child jobs",
build, item.change)
self.sched.executor.resumeBuild(build)
item.setResult(build)
# We need to resume builds because we could either have no children
# or have children that are already skipped.
self._resumeBuilds(build.build_set)
return True
def _resumeParents(self, build):
def _resumeBuilds(self, build_set):
"""
Resumes all parent jobs where all children are finished.
Resumes all paused builds of a buildset that may be resumed.
"""
item = build.build_set.item
builds = item.current_build_set.builds
jobgraph = item.job_graph
parent_builds = [builds.get(x.name) for x in
jobgraph.getParentJobsRecursively(
build.job.name, soft=True)]
for parent_build in parent_builds:
if parent_build and parent_build.paused:
# check if all child jobs are finished
child_builds = [builds.get(x.name) for x in
jobgraph.getDependentJobsRecursively(
parent_build.job.name)]
all_completed = True
for child_build in child_builds:
if not child_build or not child_build.result:
all_completed = False
break
jobgraph = build_set.item.job_graph
for build in build_set.builds.values():
if not build.paused:
continue
# check if all child jobs are finished
child_builds = [build_set.builds.get(x.name) for x in
jobgraph.getDependentJobsRecursively(
build.job.name)]
all_completed = True
for child_build in child_builds:
if not child_build or not child_build.result:
all_completed = False
break
if all_completed:
self.sched.executor.resumeBuild(parent_build)
parent_build.paused = False
if all_completed:
self.sched.executor.resumeBuild(build)
build.paused = False
def onBuildCompleted(self, build):
item = build.build_set.item
@ -771,7 +763,7 @@ class PipelineManager(object):
if build.retry:
build.build_set.removeJobNodeSet(build.job.name)
self._resumeParents(build)
self._resumeBuilds(build.build_set)
return True
def onMergeCompleted(self, event):
@ -799,7 +791,7 @@ class PipelineManager(object):
self.log.info("Node request %s: failure for %s" %
(request, request.job.name,))
build_set.item.setNodeRequestFailure(request.job)
self._resumeParents(request)
self._resumeBuilds(request.build_set)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,