Merge "Fix unreachable nodes detection"
This commit is contained in:
commit
a477550715
|
@ -5,6 +5,7 @@
|
|||
command: ansible-playbook src/git.openstack.org/openstack-infra/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
|
||||
environment:
|
||||
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
|
||||
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
|
||||
- name: Run ansible playbook that should fail
|
||||
|
@ -13,6 +14,7 @@
|
|||
failed_when: "failed_results.rc != 2"
|
||||
environment:
|
||||
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
|
||||
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
|
||||
- name: Validate output - setupvar
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Jobs that encountered unreachable nodes are now correctly detected and
|
||||
retried.
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
- job:
|
||||
name: no-log-unreachable
|
||||
attempts: 1
|
||||
run: playbooks/no-log-unreachable.yaml
|
||||
|
||||
- project:
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
- hosts: localhost
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Test
|
||||
debug:
|
||||
msg: Test
|
28
tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml
vendored
Normal file
28
tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml
vendored
Normal file
|
@ -0,0 +1,28 @@
|
|||
- hosts: localhost
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Add a fake host
|
||||
add_host:
|
||||
hostname: fake
|
||||
ansible_host: notexisting.example.notexisting
|
||||
|
||||
- hosts: fake
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Run a lineinfile task
|
||||
vars:
|
||||
logins:
|
||||
- machine: foo
|
||||
login: bar
|
||||
password: my-very-secret-password-1
|
||||
- machine: two
|
||||
login: three
|
||||
password: my-very-secret-password-2
|
||||
lineinfile:
|
||||
path: /tmp/.netrc
|
||||
mode: 0600
|
||||
create: true
|
||||
insertafter: EOF
|
||||
line: "machine {{ item.machine }} login {{ item.login }} password {{ item.password }}"
|
||||
with_items: "{{ logins }}"
|
||||
no_log: true
|
|
@ -0,0 +1,35 @@
|
|||
- pipeline:
|
||||
name: check
|
||||
manager: independent
|
||||
post-review: true
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: patchset-created
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 1
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -1
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
|
||||
- job:
|
||||
name: pre-unreachable
|
||||
attempts: 2
|
||||
pre-run:
|
||||
- playbooks/unreachable.yaml
|
||||
run: playbooks/run.yaml
|
||||
|
||||
- job:
|
||||
name: run-unreachable
|
||||
attempts: 2
|
||||
run: playbooks/unreachable.yaml
|
||||
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- pre-unreachable
|
||||
- run-unreachable
|
|
@ -0,0 +1,6 @@
|
|||
- tenant:
|
||||
name: tenant-one
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- org/project
|
|
@ -4179,6 +4179,40 @@ class TestNoLog(AnsibleZuulTestCase):
|
|||
self.assertNotIn('my-very-secret-password-2', text_log)
|
||||
|
||||
|
||||
class TestUnreachable(AnsibleZuulTestCase):
    """Check that jobs which hit unreachable nodes are detected and retried."""

    tenant_config_file = 'config/ansible-unreachable/main.yaml'

    def _get_file(self, build, path):
        """Return the text content of ``path`` below the build's jobdir root.

        :param build: a build whose ``jobdir.root`` is the base directory.
        :param path: file path relative to that base directory.
        """
        full_path = os.path.join(build.jobdir.root, path)
        with open(full_path) as handle:
            content = handle.read()
        return content

    def test_unreachable(self):
        # Both jobs run a playbook against a host that does not resolve, so
        # every attempt is expected to end as unreachable and be retried.
        self.wait_timeout = 120

        # Output extra ansible info so we might see errors.
        self.executor_server.verbose = True
        # Keep the jobdir so the unreachable marker file can be read below.
        self.executor_server.keep_jobdir = True

        change = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        created_event = change.getPatchsetCreatedEvent(1)
        self.fake_gerrit.addEvent(created_event)
        self.waitUntilSettled()

        # The result must be retry limit because jobs with unreachable nodes
        # will be retried.
        self.assertIn('RETRY_LIMIT', change.messages[0])

        # Each job is configured with two attempts, hence two history
        # entries per job.
        expected_history = [
            dict(name=job_name, result=None, changes='1,1')
            for job_name in ('pre-unreachable', 'run-unreachable')
            for _attempt in range(2)
        ]
        self.assertHistory(expected_history, ordered=False)

        first_build = self.history[0]
        unreachable_log = self._get_file(first_build,
                                         '.ansible/nodes.unreachable')
        self.assertEqual('fake\n', unreachable_log)
|
||||
|
||||
|
||||
class TestJobPause(AnsibleZuulTestCase):
|
||||
tenant_config_file = 'config/job-pause/main.yaml'
|
||||
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# Copyright 2018 BMW Carit GmbH
|
||||
#
|
||||
# Zuul is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Zuul is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# This is not needed in python3 - but it is needed in python2 because there
|
||||
# is a json module in ansible.plugins.callback and python2 gets confused.
|
||||
# Easy local testing with ansible-playbook is handy when hacking on zuul_stream
|
||||
# so just put in the __future__ statement.
|
||||
from __future__ import absolute_import
|
||||
|
||||
import os
|
||||
|
||||
from ansible.plugins.callback import default
|
||||
|
||||
|
||||
class CallbackModule(default.CallbackModule):
    """Ansible callback that records the names of unreachable hosts.

    Every host reported through ``v2_runner_on_unreachable`` is appended,
    once per callback instance, to the file
    ``$ZUUL_JOBDIR/.ansible/nodes.unreachable`` (one host name per line).
    """

    CALLBACK_VERSION = 2.0
    # aggregate means we can be loaded and not be the stdout plugin
    CALLBACK_TYPE = 'aggregate'
    CALLBACK_NAME = 'zuul_unreachable'

    def __init__(self):
        """Work out the output file path from the ZUUL_JOBDIR variable.

        Raises KeyError when ZUUL_JOBDIR is missing from the environment.
        """
        super(CallbackModule, self).__init__()
        job_dir = os.environ['ZUUL_JOBDIR']
        self.output_path = os.path.join(
            job_dir, '.ansible', 'nodes.unreachable')
        # Names already written, so each host is recorded only once.
        self.unreachable_hosts = set()

    def v2_runner_on_unreachable(self, result):
        """Append the name of the unreachable host to the output file.

        :param result: task result; its ``_host.get_name()`` supplies the
            host name that is written.
        """
        name = result._host.get_name()
        if name in self.unreachable_hosts:
            # Already recorded by an earlier unreachable event.
            return
        self.unreachable_hosts.add(name)
        with open(self.output_path, 'a') as out:
            out.write('%s\n' % name)
|
|
@ -370,6 +370,8 @@ class JobDir(object):
|
|||
self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
|
||||
os.makedirs(self.fact_cache)
|
||||
self.control_path = os.path.join(self.ansible_cache_root, 'cp')
|
||||
self.job_unreachable_file = os.path.join(self.ansible_cache_root,
|
||||
'nodes.unreachable')
|
||||
os.makedirs(self.control_path)
|
||||
localhost_facts = os.path.join(self.fact_cache, 'localhost')
|
||||
# NOTE(pabelanger): We do not want to leak zuul-executor facts to other
|
||||
|
@ -1775,7 +1777,12 @@ class AnsibleJob(object):
|
|||
|
||||
if timeout and watchdog.timed_out:
|
||||
return (self.RESULT_TIMED_OUT, None)
|
||||
if ret == 3:
|
||||
# Note: Contrary to its docs, ansible currently wrongly returns 4 on
|
||||
# unreachable so we have the zuul_unreachable callback module that
|
||||
# creates the file nodes.unreachable in case there were
|
||||
# unreachable nodes. This can be removed once ansible returns a
|
||||
# distinct value for unreachable.
|
||||
if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file):
|
||||
# AnsibleHostUnreachable: We had a network issue connecting to
|
||||
# our zuul-worker.
|
||||
return (self.RESULT_UNREACHABLE, None)
|
||||
|
|
Loading…
Reference in New Issue