Add a timeout for the image build

A builder thread can wedge if the build process wedges. Add a timeout
to the subprocess. Since it was the call to readline() that would block,
we change the process to have DIB write directly to the log. This allows
us to set a timeout in the Popen.wait() call. And we kill the dib
subprocess, as well.

The timeout value can be controlled in the diskimage configuration and
defaults to 8 hours.

Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d
This commit is contained in:
David Shrewsbury 2019-01-10 14:45:21 -05:00
parent 7fd6411dac
commit 7225354ec0
10 changed files with 149 additions and 51 deletions

View File

@ -277,6 +277,12 @@ Options
Specifies the distro to be used as a base image to build the image using
diskimage-builder.
.. attr:: build-timeout
:type: int
How long (in seconds) to wait for the diskimage build before giving up.
The default is 8 hours.
.. attr:: elements
:type: list

View File

@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
if 'qcow2' in img_types:
qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
log_fn = self._getBuildLog(diskimage.name, build_id)
cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
(self.dib_cmd, img_types, qemu_img_options, filename,
img_elements))
log_fn, img_elements))
self._pruneBuildLogs(diskimage.name)
log_fn = self._getBuildLog(diskimage.name, build_id)
self.log.info('Running %s' % (cmd,))
self.log.info('Logging to %s' % (log_fn,))
start_time = time.monotonic()
# We used to use readline() on stdout to output the lines to the
# build log. Unfortunately, this would block as long as the process
# ran (with no easy way to timeout the read) and wedge the builder.
# Now we use --logfile option to the dib command and set a timeout
# on the wait() call to prevent the wedge.
did_timeout = False
try:
p = subprocess.Popen(
shlex.split(cmd),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
env=env)
except OSError as e:
@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
)
with open(log_fn, 'wb') as log:
while True:
ln = p.stdout.readline()
log.write(ln)
log.flush()
if not ln:
break
rc = p.wait()
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))
try:
rc = p.wait(timeout=diskimage.build_timeout)
except subprocess.TimeoutExpired:
p.kill()
did_timeout = True
rc = 1
self.log.error(
"Build timeout for image %s, build %s (log: %s)",
diskimage.name, build_id, log_fn)
else:
# Append return code to dib's log file
with open(log_fn, 'ab') as log:
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))
# It's possible the connection to the ZK cluster could have been
# interrupted during the build. If so, wait for it to return.
@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
self.log.info("ZooKeeper lost while building %s" % diskimage.name)
self._zk.resetLostFlag()
build_data.state = zk.FAILED
elif p.returncode:
elif p.returncode or did_timeout:
self.log.info(
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
"DIB failed creating %s (%s) (timeout=%s)" % (
diskimage.name, p.returncode, did_timeout))
build_data.state = zk.FAILED
else:
self.log.info("DIB image %s is built" % diskimage.name)

View File

@ -44,6 +44,7 @@ class ConfigValidator:
'rebuild-age': int,
'env-vars': {str: str},
'username': str,
'build-timeout': int,
}
webapp = {

View File

@ -118,6 +118,7 @@ class Config(ConfigValue):
d.image_types = set(diskimage.get('formats', []))
d.pause = bool(diskimage.get('pause', False))
d.username = diskimage.get('username', 'zuul')
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
self.diskimages[d.name] = d
def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
self.image_types = None
self.pause = False
self.username = None
self.build_timeout = None
def __eq__(self, other):
if isinstance(other, DiskImage):
@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
other.env_vars == self.env_vars and
other.image_types == self.image_types and
other.pause == self.pause and
other.username == self.username)
other.username == self.username and
other.build_timeout == self.build_timeout)
return False
def __repr__(self):

View File

@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase):
time.sleep(1)
self.wait_for_threads()
def waitForBuild(self, image_name, build_id):
def waitForBuild(self, image_name, build_id, states=None):
if states is None:
states = (zk.READY,)
base = "-".join([image_name, build_id])
while True:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break
time.sleep(1)
while True:
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if build and build.state == zk.READY:
if build and build.state in states:
break
time.sleep(1)
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break
time.sleep(1)

View File

@ -1,10 +1,49 @@
#!/bin/bash
outfile=
outtypes=("qcow2")
all_args=$*
logfile=
checksum=
no_tmpfs=
qemu_img_options=
x=
TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
if [ $? -ne 0 ]; then
echo "Invalid option"
exit 1
fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
checksum=1; shift 1;;
--no-tmpfs)
no_tmpfs=1; shift 1;;
--qemu-img-options)
qemu_img_options=$2; shift 2;;
--logfile)
logfile=$2; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) x=1; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done
# If --logfile was given, direct stdout to it, as well
if [ ! -z "$logfile" ]; then
exec > >(tee -a ${logfile})
fi
echo "*** fake-image-create: start"
echo "arguments:"
echo "----"
echo $*
echo "$all_args"
echo "----"
if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
exit 1
fi
outfile=
outtypes=("qcow2")
TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
if [ $? -ne 0 ]; then
echo "Invalid option"
exit 1
if [ ! -z "$logfile" ]; then
echo " -> logfile: $logfile"
fi
if [ ! -z "$checksum" ]; then
echo " -> set --checksum"
fi
if [ ! -z "$no_tmpfs" ]; then
echo " -> set --no-tmpfs"
fi
if [ ! -z "$qemu_img_options" ]; then
echo " -> qemu-img-options: $qemu_img_options"
fi
if [ ! -z "$x" ]; then
echo " -> debugging enabled"
fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
echo " -> set --checksum"; shift 1;;
--no-tmpfs)
echo " -> set --no-tmpfs"; shift 1;;
--qemu-img-options)
echo " -> qemu-img-options: $2"; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) echo " -> debugging enabled"; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done
if [ -z "$outfile" ]; then
echo "No output file specified."

View File

@ -130,6 +130,7 @@ diskimages:
- cache-devstack
release: trusty
rebuild-age: 3600
build-timeout: 3600
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache

View File

@ -0,0 +1,26 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels: []
providers: []
diskimages:
- name: fake-image
formats:
- tar
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -16,6 +16,8 @@
import os
import uuid
import fixtures
import mock
import subprocess
from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider
@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
self.assertEqual(build_default._formats, ['qcow2'])
self.assertEqual(build_vhd._formats, ['vhd'])
@mock.patch.object(subprocess.Popen, 'wait')
def test_diskimage_build_timeout(self, mock_wait):
mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
configfile = self.setup_config('diskimage_build_timeout.yaml')
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

View File

@ -0,0 +1,6 @@
---
features:
- |
A new option (build-timeout) has been added to the builder diskimage
configuration to control how long the builder should wait for image
builds before giving up. The default is 8 hours.