summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Shrewsbury <shrewsbury.dave@gmail.com>2019-01-10 14:45:21 -0500
committerDavid Shrewsbury <shrewsbury.dave@gmail.com>2019-01-23 16:27:19 -0500
commit7225354ec04f0466d95b6d68e51002723d6529c9 (patch)
treea845ad9bb5bcec681bcc29b332174d13819b8d9b
parent7fd6411dac109384a47b672b2f97313980cee687 (diff)
Add a timeout for the image build
A builder thread can wedge if the build process wedges. Add a timeout to the subprocess. Since it was the call to readline() that would block, we change the process to have DIB write directly to the log. This allows us to set a timeout in the Popen.wait() call. And we kill the dib subprocess, as well. The timeout value can be controlled in the diskimage configuration and defaults to 8 hours. Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d
Notes
Notes (review): Code-Review+2: James E. Blair <corvus@inaugust.com> Code-Review+2: Ian Wienand <iwienand@redhat.com> Workflow+1: James E. Blair <corvus@inaugust.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Thu, 24 Jan 2019 16:15:32 +0000 Reviewed-on: https://review.openstack.org/629923 Project: openstack-infra/nodepool Branch: refs/heads/master
-rw-r--r--doc/source/configuration.rst6
-rwxr-xr-xnodepool/builder.py45
-rw-r--r--nodepool/cmd/config_validator.py1
-rwxr-xr-xnodepool/config.py5
-rw-r--r--nodepool/tests/__init__.py19
-rwxr-xr-xnodepool/tests/fake-image-create78
-rw-r--r--nodepool/tests/fixtures/config_validate/good.yaml1
-rw-r--r--nodepool/tests/fixtures/diskimage_build_timeout.yaml26
-rw-r--r--nodepool/tests/unit/test_builder.py9
-rw-r--r--releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml6
10 files changed, 147 insertions, 49 deletions
diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 27abacf..7bec670 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -277,6 +277,12 @@ Options
277 Specifies the distro to be used as a base image to build the image using 277 Specifies the distro to be used as a base image to build the image using
278 diskimage-builder. 278 diskimage-builder.
279 279
280 .. attr:: build-timeout
281 :type: int
282
283 How long (in seconds) to wait for the diskimage build before giving up.
284 The default is 8 hours.
285
280 .. attr:: elements 286 .. attr:: elements
281 :type: list 287 :type: list
282 288
diff --git a/nodepool/builder.py b/nodepool/builder.py
index dce709e..16fe714 100755
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
739 if 'qcow2' in img_types: 739 if 'qcow2' in img_types:
740 qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS 740 qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
741 741
742 cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' % 742 log_fn = self._getBuildLog(diskimage.name, build_id)
743
744 cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
743 (self.dib_cmd, img_types, qemu_img_options, filename, 745 (self.dib_cmd, img_types, qemu_img_options, filename,
744 img_elements)) 746 log_fn, img_elements))
745 747
746 self._pruneBuildLogs(diskimage.name) 748 self._pruneBuildLogs(diskimage.name)
747 log_fn = self._getBuildLog(diskimage.name, build_id)
748 749
749 self.log.info('Running %s' % (cmd,)) 750 self.log.info('Running %s' % (cmd,))
750 self.log.info('Logging to %s' % (log_fn,)) 751 self.log.info('Logging to %s' % (log_fn,))
751 752
752 start_time = time.monotonic() 753 start_time = time.monotonic()
754
755 # We used to use readline() on stdout to output the lines to the
756 # build log. Unfortunately, this would block as long as the process
757 # ran (with no easy way to timeout the read) and wedge the builder.
758 # Now we use --logfile option to the dib command and set a timeout
759 # on the wait() call to prevent the wedge.
760 did_timeout = False
753 try: 761 try:
754 p = subprocess.Popen( 762 p = subprocess.Popen(
755 shlex.split(cmd), 763 shlex.split(cmd),
756 stdout=subprocess.PIPE,
757 stderr=subprocess.STDOUT, 764 stderr=subprocess.STDOUT,
758 env=env) 765 env=env)
759 except OSError as e: 766 except OSError as e:
@@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
761 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) 768 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 ) 769 )
763 770
764 with open(log_fn, 'wb') as log: 771 try:
765 while True: 772 rc = p.wait(timeout=diskimage.build_timeout)
766 ln = p.stdout.readline() 773 except subprocess.TimeoutExpired:
767 log.write(ln) 774 p.kill()
768 log.flush() 775 did_timeout = True
769 if not ln: 776 rc = 1
770 break 777 self.log.error(
771 778 "Build timeout for image %s, build %s (log: %s)",
772 rc = p.wait() 779 diskimage.name, build_id, log_fn)
773 m = "Exit code: %s\n" % rc 780 else:
774 log.write(m.encode('utf8')) 781 # Append return code to dib's log file
782 with open(log_fn, 'ab') as log:
783 m = "Exit code: %s\n" % rc
784 log.write(m.encode('utf8'))
775 785
776 # It's possible the connection to the ZK cluster could have been 786 # It's possible the connection to the ZK cluster could have been
777 # interrupted during the build. If so, wait for it to return. 787 # interrupted during the build. If so, wait for it to return.
@@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
796 self.log.info("ZooKeeper lost while building %s" % diskimage.name) 806 self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 self._zk.resetLostFlag() 807 self._zk.resetLostFlag()
798 build_data.state = zk.FAILED 808 build_data.state = zk.FAILED
799 elif p.returncode: 809 elif p.returncode or did_timeout:
800 self.log.info( 810 self.log.info(
801 "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) 811 "DIB failed creating %s (%s) (timeout=%s)" % (
812 diskimage.name, p.returncode, did_timeout))
802 build_data.state = zk.FAILED 813 build_data.state = zk.FAILED
803 else: 814 else:
804 self.log.info("DIB image %s is built" % diskimage.name) 815 self.log.info("DIB image %s is built" % diskimage.name)
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index c3dde57..f2646af 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -44,6 +44,7 @@ class ConfigValidator:
44 'rebuild-age': int, 44 'rebuild-age': int,
45 'env-vars': {str: str}, 45 'env-vars': {str: str},
46 'username': str, 46 'username': str,
47 'build-timeout': int,
47 } 48 }
48 49
49 webapp = { 50 webapp = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 2b5e4eb..7f09961 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 d.image_types = set(diskimage.get('formats', [])) 118 d.image_types = set(diskimage.get('formats', []))
119 d.pause = bool(diskimage.get('pause', False)) 119 d.pause = bool(diskimage.get('pause', False))
120 d.username = diskimage.get('username', 'zuul') 120 d.username = diskimage.get('username', 'zuul')
121 d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 self.diskimages[d.name] = d 122 self.diskimages[d.name] = d
122 123
123 def setSecureDiskimageEnv(self, diskimages, secure_config_path): 124 def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 self.image_types = None 180 self.image_types = None
180 self.pause = False 181 self.pause = False
181 self.username = None 182 self.username = None
183 self.build_timeout = None
182 184
183 def __eq__(self, other): 185 def __eq__(self, other):
184 if isinstance(other, DiskImage): 186 if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 other.env_vars == self.env_vars and 191 other.env_vars == self.env_vars and
190 other.image_types == self.image_types and 192 other.image_types == self.image_types and
191 other.pause == self.pause and 193 other.pause == self.pause and
192 other.username == self.username) 194 other.username == self.username and
195 other.build_timeout == self.build_timeout)
193 return False 196 return False
194 197
195 def __repr__(self): 198 def __repr__(self):
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 4a8e83d..376b4fb 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase):
410 time.sleep(1) 410 time.sleep(1)
411 self.wait_for_threads() 411 self.wait_for_threads()
412 412
413 def waitForBuild(self, image_name, build_id): 413 def waitForBuild(self, image_name, build_id, states=None):
414 if states is None:
415 states = (zk.READY,)
416
414 base = "-".join([image_name, build_id]) 417 base = "-".join([image_name, build_id])
418
415 while True: 419 while True:
416 self.wait_for_threads() 420 self.wait_for_threads()
417 files = builder.DibImageFile.from_image_id( 421 build = self.zk.getBuild(image_name, build_id)
418 self._config_images_dir.path, base) 422 if build and build.state in states:
419 if files:
420 break 423 break
421 time.sleep(1) 424 time.sleep(1)
422 425
423 while True: 426 # We should only expect a dib manifest with a successful build.
427 while build.state == zk.READY:
424 self.wait_for_threads() 428 self.wait_for_threads()
425 build = self.zk.getBuild(image_name, build_id) 429 files = builder.DibImageFile.from_image_id(
426 if build and build.state == zk.READY: 430 self._config_images_dir.path, base)
431 if files:
427 break 432 break
428 time.sleep(1) 433 time.sleep(1)
429 434
diff --git a/nodepool/tests/fake-image-create b/nodepool/tests/fake-image-create
index 9d236ff..1a2406f 100755
--- a/nodepool/tests/fake-image-create
+++ b/nodepool/tests/fake-image-create
@@ -1,10 +1,49 @@
1#!/bin/bash 1#!/bin/bash
2 2
3outfile=
4outtypes=("qcow2")
5
6all_args=$*
7logfile=
8checksum=
9no_tmpfs=
10qemu_img_options=
11x=
12
13TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
14if [ $? -ne 0 ]; then
15 echo "Invalid option"
16 exit 1
17fi
18eval set -- "$TEMP"
19while true ; do
20 case "$1" in
21 --checksum)
22 checksum=1; shift 1;;
23 --no-tmpfs)
24 no_tmpfs=1; shift 1;;
25 --qemu-img-options)
26 qemu_img_options=$2; shift 2;;
27 --logfile)
28 logfile=$2; shift 2;;
29 -o) outfile=$2; shift 2;;
30 -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
31 -x) x=1; shift;;
32 --) shift ; break ;;
33 *) echo "Unknown option : $1"; exit 1;;
34 esac
35done
36
37# If --logfile was given, direct stdout to it, as well
38if [ ! -z "$logfile" ]; then
39 exec > >(tee -a ${logfile})
40fi
41
3echo "*** fake-image-create: start" 42echo "*** fake-image-create: start"
4 43
5echo "arguments:" 44echo "arguments:"
6echo "----" 45echo "----"
7echo $* 46echo "$all_args"
8echo "----" 47echo "----"
9 48
10if [[ "${SHOULD_FAIL}" == 'true' ]]; then 49if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
38 exit 1 77 exit 1
39fi 78fi
40 79
41outfile= 80if [ ! -z "$logfile" ]; then
42outtypes=("qcow2") 81 echo " -> logfile: $logfile"
43 82fi
44TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@") 83if [ ! -z "$checksum" ]; then
45if [ $? -ne 0 ]; then 84 echo " -> set --checksum"
46 echo "Invalid option" 85fi
47 exit 1 86if [ ! -z "$no_tmpfs" ]; then
87 echo " -> set --no-tmpfs"
88fi
89if [ ! -z "$qemu_img_options" ]; then
90 echo " -> qemu-img-options: $qemu_img_options"
91fi
92if [ ! -z "$x" ]; then
93 echo " -> debugging enabled"
48fi 94fi
49eval set -- "$TEMP"
50while true ; do
51 case "$1" in
52 --checksum)
53 echo " -> set --checksum"; shift 1;;
54 --no-tmpfs)
55 echo " -> set --no-tmpfs"; shift 1;;
56 --qemu-img-options)
57 echo " -> qemu-img-options: $2"; shift 2;;
58 -o) outfile=$2; shift 2;;
59 -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
60 -x) echo " -> debugging enabled"; shift;;
61 --) shift ; break ;;
62 *) echo "Unknown option : $1"; exit 1;;
63 esac
64done
65 95
66if [ -z "$outfile" ]; then 96if [ -z "$outfile" ]; then
67 echo "No output file specified." 97 echo "No output file specified."
diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml
index db7d1ec..fc59937 100644
--- a/nodepool/tests/fixtures/config_validate/good.yaml
+++ b/nodepool/tests/fixtures/config_validate/good.yaml
@@ -130,6 +130,7 @@ diskimages:
130 - cache-devstack 130 - cache-devstack
131 release: trusty 131 release: trusty
132 rebuild-age: 3600 132 rebuild-age: 3600
133 build-timeout: 3600
133 env-vars: 134 env-vars:
134 TMPDIR: /opt/dib_tmp 135 TMPDIR: /opt/dib_tmp
135 DIB_IMAGE_CACHE: /opt/dib_cache 136 DIB_IMAGE_CACHE: /opt/dib_cache
diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
new file mode 100644
index 0000000..01770bd
--- /dev/null
+++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
@@ -0,0 +1,26 @@
1elements-dir: .
2images-dir: '{images_dir}'
3build-log-dir: '{build_log_dir}'
4
5zookeeper-servers:
6 - host: {zookeeper_host}
7 port: {zookeeper_port}
8 chroot: {zookeeper_chroot}
9
10labels: []
11
12providers: []
13
14diskimages:
15 - name: fake-image
16 formats:
17 - tar
18 elements:
19 - fedora
20 - vm
21 release: 21
22 env-vars:
23 TMPDIR: /opt/dib_tmp
24 DIB_IMAGE_CACHE: /opt/dib_cache
25 DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
26 BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py
index c992bf7..cfea4b8 100644
--- a/nodepool/tests/unit/test_builder.py
+++ b/nodepool/tests/unit/test_builder.py
@@ -16,6 +16,8 @@
16import os 16import os
17import uuid 17import uuid
18import fixtures 18import fixtures
19import mock
20import subprocess
19 21
20from nodepool import builder, exceptions, tests 22from nodepool import builder, exceptions, tests
21from nodepool.driver.fake import provider as fakeprovider 23from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 337
336 self.assertEqual(build_default._formats, ['qcow2']) 338 self.assertEqual(build_default._formats, ['qcow2'])
337 self.assertEqual(build_vhd._formats, ['vhd']) 339 self.assertEqual(build_vhd._formats, ['vhd'])
340
341 @mock.patch.object(subprocess.Popen, 'wait')
342 def test_diskimage_build_timeout(self, mock_wait):
343 mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
344 configfile = self.setup_config('diskimage_build_timeout.yaml')
345 self.useBuilder(configfile, cleanup_interval=0)
346 self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
new file mode 100644
index 0000000..6901903
--- /dev/null
+++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 A new option (build-timeout) has been added to the builder diskimage
5 configuration to control how long the builder should wait for image
6 builds before giving up. The default is 8 hours.