Resume paused job with skipped children
Currently there are two corner cases where we can miss to resume a paused job. In the problematic scenario we have a job that pauses (compile), a job that is independent of 'compile' (pre-test) and a job that depends on both (test). The first corner case is 'pre-test' fails before 'compile' pauses. This leads to 'test' already skipped when 'compile' pauses. This is currently not catched because a paused job only directly resumes if there are no child jobs which was based under the wrong assumption that there can be no build result prior to 'compile'. The second corner case is that 'pre-test' fails after 'compile' pauses. In this case it resumes its own parent jobs (that don't exist) but not the parent jobs of the jobs that get marked as skipped. Both cases can be solved by not resuming parent builds of builds but just iterate over the buildset and resume any paused job that is allowed to be resumed. This also makes it possible to remove the special resume behavior if the paused job has no children as this case is now handled by the common resume handling anyway. Change-Id: If1a9e62d1b3d1782eefef8adfb2ef1a7ba75f2a1
This commit is contained in:
parent
9c2b0a9bbe
commit
a239ac23a7
|
@ -1502,6 +1502,9 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
|
|||
build = self.executor_server.job_builds[self.job.unique]
|
||||
|
||||
if self.executor_server._run_ansible:
|
||||
# Call run on the fake build omitting the result so we also can
|
||||
# hold real ansible jobs.
|
||||
build.run()
|
||||
result = super(RecordingAnsibleJob, self).runAnsible(
|
||||
cmd, timeout, playbook, wrapped)
|
||||
else:
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
test
|
|
@ -0,0 +1,7 @@
|
|||
- hosts: all
|
||||
tasks:
|
||||
- name: Pause and let child run
|
||||
zuul_return:
|
||||
data:
|
||||
zuul:
|
||||
pause: true
|
|
@ -0,0 +1,4 @@
|
|||
- hosts: all
|
||||
tasks:
|
||||
- fail:
|
||||
msg: pre-test fails
|
|
@ -0,0 +1,4 @@
|
|||
- hosts: all
|
||||
tasks:
|
||||
- debug:
|
||||
msg: Dummy
|
|
@ -0,0 +1,22 @@
|
|||
- job:
|
||||
name: compile
|
||||
run: playbooks/compile.yaml
|
||||
|
||||
- job:
|
||||
name: test
|
||||
run: playbooks/test.yaml
|
||||
|
||||
- job:
|
||||
name: pre-test
|
||||
run: playbooks/pre-test.yaml
|
||||
|
||||
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- compile
|
||||
- pre-test
|
||||
- test:
|
||||
dependencies:
|
||||
- compile
|
||||
- pre-test
|
|
@ -6,3 +6,4 @@
|
|||
- common-config
|
||||
- org/project
|
||||
- org/project2
|
||||
- org/project3
|
||||
|
|
|
@ -4236,3 +4236,82 @@ class TestJobPause(AnsibleZuulTestCase):
|
|||
self.assertHistory([
|
||||
dict(name='just-pause', result='SUCCESS', changes='1,1'),
|
||||
], ordered=False)
|
||||
|
||||
def test_job_pause_skipped_child(self):
|
||||
"""
|
||||
Tests that a paused job is resumed with externally skipped jobs.
|
||||
|
||||
Tests that this situation won't lead to stuck buildsets.
|
||||
Compile pauses before pre-test fails.
|
||||
|
||||
1. compile (pauses) --+
|
||||
|
|
||||
+--> test (skipped because of pre-test)
|
||||
|
|
||||
2. pre-test (fails) --+
|
||||
"""
|
||||
self.wait_timeout = 120
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
# Output extra ansible info so we might see errors.
|
||||
self.executor_server.verbose = True
|
||||
self.executor_server.keep_jobdir = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
|
||||
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.release('compile')
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.assertHistory([
|
||||
dict(name='pre-test', result='FAILURE', changes='1,1'),
|
||||
dict(name='compile', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
|
||||
self.assertIn('test : SKIPPED', A.messages[0])
|
||||
|
||||
def test_job_pause_pre_skipped_child(self):
|
||||
"""
|
||||
Tests that a paused job is resumed with pre-existing skipped jobs.
|
||||
|
||||
Tests that this situation won't lead to stuck buildsets.
|
||||
The pre-test fails before compile pauses so test is already skipped
|
||||
when compile pauses.
|
||||
|
||||
1. pre-test (fails) --+
|
||||
|
|
||||
+--> test (skipped because of pre-test)
|
||||
|
|
||||
2. compile (pauses) --+
|
||||
"""
|
||||
self.wait_timeout = 120
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
# Output extra ansible info so we might see errors.
|
||||
self.executor_server.verbose = True
|
||||
self.executor_server.keep_jobdir = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
|
||||
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.release('pre-test')
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.assertHistory([
|
||||
dict(name='pre-test', result='FAILURE', changes='1,1'),
|
||||
dict(name='compile', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
|
||||
self.assertIn('test : SKIPPED', A.messages[0])
|
||||
|
|
|
@ -721,42 +721,34 @@ class PipelineManager(object):
|
|||
def onBuildPaused(self, build):
|
||||
item = build.build_set.item
|
||||
self.log.debug("Build %s of %s paused", build, item.change)
|
||||
|
||||
child_jobs = item.job_graph.getDependentJobsRecursively(build.job.name)
|
||||
if not child_jobs:
|
||||
# if there are no child jobs we need to directly resume the build
|
||||
self.log.debug("Resuming build %s of %s with no child jobs",
|
||||
build, item.change)
|
||||
self.sched.executor.resumeBuild(build)
|
||||
|
||||
item.setResult(build)
|
||||
|
||||
# We need to resume builds because we could either have no children
|
||||
# or have children that are already skipped.
|
||||
self._resumeBuilds(build.build_set)
|
||||
return True
|
||||
|
||||
def _resumeParents(self, build):
|
||||
def _resumeBuilds(self, build_set):
|
||||
"""
|
||||
Resumes all parent jobs where all children are finished.
|
||||
Resumes all paused builds of a buildset that may be resumed.
|
||||
"""
|
||||
item = build.build_set.item
|
||||
builds = item.current_build_set.builds
|
||||
jobgraph = item.job_graph
|
||||
parent_builds = [builds.get(x.name) for x in
|
||||
jobgraph.getParentJobsRecursively(
|
||||
build.job.name, soft=True)]
|
||||
for parent_build in parent_builds:
|
||||
if parent_build and parent_build.paused:
|
||||
# check if all child jobs are finished
|
||||
child_builds = [builds.get(x.name) for x in
|
||||
jobgraph.getDependentJobsRecursively(
|
||||
parent_build.job.name)]
|
||||
all_completed = True
|
||||
for child_build in child_builds:
|
||||
if not child_build or not child_build.result:
|
||||
all_completed = False
|
||||
break
|
||||
jobgraph = build_set.item.job_graph
|
||||
for build in build_set.builds.values():
|
||||
if not build.paused:
|
||||
continue
|
||||
# check if all child jobs are finished
|
||||
child_builds = [build_set.builds.get(x.name) for x in
|
||||
jobgraph.getDependentJobsRecursively(
|
||||
build.job.name)]
|
||||
all_completed = True
|
||||
for child_build in child_builds:
|
||||
if not child_build or not child_build.result:
|
||||
all_completed = False
|
||||
break
|
||||
|
||||
if all_completed:
|
||||
self.sched.executor.resumeBuild(parent_build)
|
||||
parent_build.paused = False
|
||||
if all_completed:
|
||||
self.sched.executor.resumeBuild(build)
|
||||
build.paused = False
|
||||
|
||||
def onBuildCompleted(self, build):
|
||||
item = build.build_set.item
|
||||
|
@ -771,7 +763,7 @@ class PipelineManager(object):
|
|||
if build.retry:
|
||||
build.build_set.removeJobNodeSet(build.job.name)
|
||||
|
||||
self._resumeParents(build)
|
||||
self._resumeBuilds(build.build_set)
|
||||
return True
|
||||
|
||||
def onMergeCompleted(self, event):
|
||||
|
@ -799,7 +791,7 @@ class PipelineManager(object):
|
|||
self.log.info("Node request %s: failure for %s" %
|
||||
(request, request.job.name,))
|
||||
build_set.item.setNodeRequestFailure(request.job)
|
||||
self._resumeParents(request)
|
||||
self._resumeBuilds(request.build_set)
|
||||
self.log.info("Completed node request %s for job %s of item %s "
|
||||
"with nodes %s" %
|
||||
(request, request.job, build_set.item,
|
||||
|
|
Loading…
Reference in New Issue