Merge "Fix unreachable nodes detection"

This commit is contained in:
Zuul 2018-11-05 12:34:24 +00:00 committed by Gerrit Code Review
commit a477550715
10 changed files with 170 additions and 1 deletions

View File

@ -5,6 +5,7 @@
command: ansible-playbook src/git.openstack.org/openstack-infra/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
environment:
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
- name: Run ansible playbook that should fail
@ -13,6 +14,7 @@
failed_when: "failed_results.rc != 2"
environment:
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
- name: Validate output - setupvar

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Jobs that encountered unreachable nodes are now correctly detected and
retried.

View File

@ -18,6 +18,7 @@
- job:
name: no-log-unreachable
attempts: 1
run: playbooks/no-log-unreachable.yaml
- project:

View File

@ -0,0 +1,6 @@
# Minimal playbook: a single play on localhost, with fact gathering disabled,
# whose only task prints the message "Test" through the debug module.
- hosts: localhost
gather_facts: no
tasks:
- name: Test
debug:
msg: Test

View File

@ -0,0 +1,28 @@
# Play 1: on localhost, register an in-memory inventory host named "fake"
# whose address is notexisting.example.notexisting (presumably chosen because
# it can never be resolved or connected to -- confirm).
- hosts: localhost
gather_facts: no
tasks:
- name: Add a fake host
add_host:
hostname: fake
ansible_host: notexisting.example.notexisting
# Play 2: target the "fake" host registered by the first play.
- hosts: fake
gather_facts: no
tasks:
- name: Run a lineinfile task
vars:
# Dummy credential entries; each one is rendered into one .netrc line below.
logins:
- machine: foo
login: bar
password: my-very-secret-password-1
- machine: two
login: three
password: my-very-secret-password-2
lineinfile:
path: /tmp/.netrc
mode: 0600
create: true
insertafter: EOF
line: "machine {{ item.machine }} login {{ item.login }} password {{ item.password }}"
# Loop over every entry of "logins"; no_log is enabled for this task.
with_items: "{{ logins }}"
no_log: true

View File

@ -0,0 +1,35 @@
# "check" pipeline: independent manager, triggered by gerrit patchset-created
# events; reports Verified +1 on success and Verified -1 on failure.
- pipeline:
name: check
manager: independent
post-review: true
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
# Base job without a parent.
- job:
name: base
parent: null
# Uses playbooks/unreachable.yaml as the pre-run playbook and
# playbooks/run.yaml as the run playbook; limited to two attempts.
- job:
name: pre-unreachable
attempts: 2
pre-run:
- playbooks/unreachable.yaml
run: playbooks/run.yaml
# Uses playbooks/unreachable.yaml as the run playbook itself; limited to two
# attempts.
- job:
name: run-unreachable
attempts: 2
run: playbooks/unreachable.yaml
# Both jobs run in the check pipeline of this project.
- project:
check:
jobs:
- pre-unreachable
- run-unreachable

View File

@ -0,0 +1,6 @@
# Tenant definition: "tenant-one" lists org/project as a config project under
# the gerrit source.
- tenant:
name: tenant-one
source:
gerrit:
config-projects:
- org/project

View File

@ -4179,6 +4179,40 @@ class TestNoLog(AnsibleZuulTestCase):
self.assertNotIn('my-very-secret-password-2', text_log)
class TestUnreachable(AnsibleZuulTestCase):
    """Check that builds hitting an unreachable host are retried."""

    tenant_config_file = 'config/ansible-unreachable/main.yaml'

    def _get_file(self, build, path):
        """Return the text content of ``path`` below the build's jobdir."""
        full_path = os.path.join(build.jobdir.root, path)
        with open(full_path) as handle:
            content = handle.read()
        return content

    def test_unreachable(self):
        self.wait_timeout = 120

        # Make ansible verbose so that errors show up in the output, and
        # keep the job directories so their files can be read afterwards.
        self.executor_server.verbose = True
        self.executor_server.keep_jobdir = True

        change = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()

        # Jobs with unreachable nodes are retried, so the reported result
        # has to be the retry limit.
        self.assertIn('RETRY_LIMIT', change.messages[0])

        # Each of the two jobs is expected twice in the history.
        expected_builds = [
            dict(name=job_name, result=None, changes='1,1')
            for job_name in ('pre-unreachable', 'pre-unreachable',
                             'run-unreachable', 'run-unreachable')
        ]
        self.assertHistory(expected_builds, ordered=False)

        # The callback records the unreachable host name in the jobdir.
        first_build = self.history[0]
        unreachable_log = self._get_file(first_build,
                                         '.ansible/nodes.unreachable')
        self.assertEqual('fake\n', unreachable_log)
class TestJobPause(AnsibleZuulTestCase):
tenant_config_file = 'config/job-pause/main.yaml'

View File

@ -0,0 +1,45 @@
# Copyright 2018 BMW Carit GmbH
#
# Zuul is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Zuul is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
# This is not needed in python3 - but it is needed in python2 because there
# is a json module in ansible.plugins.callback and python2 gets confused.
# Easy local testing with ansible-playbook is handy when hacking on zuul_stream
# so just put in the __future__ statement.
from __future__ import absolute_import
import os
from ansible.plugins.callback import default
class CallbackModule(default.CallbackModule):
    """Callback plugin that records unreachable hosts in the job directory.

    The name of every host reported as unreachable is appended, once and
    one per line, to ``<ZUUL_JOBDIR>/.ansible/nodes.unreachable``.
    """

    CALLBACK_VERSION = 2.0
    # aggregate means we can be loaded and not be the stdout plugin
    CALLBACK_TYPE = 'aggregate'
    CALLBACK_NAME = 'zuul_unreachable'

    def __init__(self):
        """Derive the output path from the ZUUL_JOBDIR environment variable.

        Raises:
            KeyError: if ZUUL_JOBDIR is not set in the environment.
        """
        super(CallbackModule, self).__init__()
        self.output_path = os.path.join(
            os.environ['ZUUL_JOBDIR'], '.ansible', 'nodes.unreachable')
        # Hosts that were already handled, so each name is written only once.
        self.unreachable_hosts = set()

    def _ensure_output_dir(self):
        """Create the directory holding the output file if it is missing."""
        output_dir = os.path.dirname(self.output_path)
        if os.path.isdir(output_dir):
            return
        try:
            os.makedirs(output_dir)
        except OSError:
            # Something else may have created the directory in the meantime;
            # only re-raise if it is still missing. (exist_ok is avoided to
            # keep this usable under python2.)
            if not os.path.isdir(output_dir):
                raise

    def v2_runner_on_unreachable(self, result):
        """Append the unreachable host's name to the output file.

        :param result: task result object; only ``result._host.get_name()``
            is used.
        """
        host = result._host.get_name()
        if host in self.unreachable_hosts:
            return
        self.unreachable_hosts.add(host)
        # Without the directory the append below would fail and the
        # unreachable marker file would never be written.
        self._ensure_output_dir()
        with open(self.output_path, 'a') as f:
            f.write('%s\n' % host)

View File

@ -370,6 +370,8 @@ class JobDir(object):
self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
os.makedirs(self.fact_cache)
self.control_path = os.path.join(self.ansible_cache_root, 'cp')
self.job_unreachable_file = os.path.join(self.ansible_cache_root,
'nodes.unreachable')
os.makedirs(self.control_path)
localhost_facts = os.path.join(self.fact_cache, 'localhost')
# NOTE(pabelanger): We do not want to leak zuul-executor facts to other
@ -1775,7 +1777,12 @@ class AnsibleJob(object):
if timeout and watchdog.timed_out:
return (self.RESULT_TIMED_OUT, None)
if ret == 3:
# Note: Contrary to its documentation, ansible currently returns 4 on
# unreachable hosts, so we have the zuul_unreachable callback module that
# creates the file nodes.unreachable in the job's ansible cache directory
# in case there were unreachable nodes. This can be removed once ansible
# returns a distinct value for unreachable.
if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file):
# AnsibleHostUnreachable: We had a network issue connecting to
# our zuul-worker.
return (self.RESULT_UNREACHABLE, None)