summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2019-01-24 16:15:32 +0000
committerGerrit Code Review <review@openstack.org>2019-01-24 16:15:32 +0000
commit26c57ee5a90320b4a4651aecbda384ef9c97946c (patch)
tree64e86027b0cabfecd9a32a28ab41e3873f0bf29c
parent44ae87c3105b44e3c21aa6c4077e4792d9bcb2a0 (diff)
parent7225354ec04f0466d95b6d68e51002723d6529c9 (diff)
Merge "Add a timeout for the image build"
-rw-r--r--doc/source/configuration.rst6
-rwxr-xr-xnodepool/builder.py45
-rw-r--r--nodepool/cmd/config_validator.py1
-rwxr-xr-xnodepool/config.py5
-rw-r--r--nodepool/tests/__init__.py19
-rwxr-xr-xnodepool/tests/fake-image-create78
-rw-r--r--nodepool/tests/fixtures/config_validate/good.yaml1
-rw-r--r--nodepool/tests/fixtures/diskimage_build_timeout.yaml26
-rw-r--r--nodepool/tests/unit/test_builder.py9
-rw-r--r--releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml6
10 files changed, 147 insertions, 49 deletions
diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 34dcdd8..ba442c0 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -277,6 +277,12 @@ Options
277 Specifies the distro to be used as a base image to build the image using 277 Specifies the distro to be used as a base image to build the image using
278 diskimage-builder. 278 diskimage-builder.
279 279
280 .. attr:: build-timeout
281 :type: int
282
283 How long (in seconds) to wait for the diskimage build before giving up.
284 The default is 8 hours.
285
280 .. attr:: elements 286 .. attr:: elements
281 :type: list 287 :type: list
282 288
diff --git a/nodepool/builder.py b/nodepool/builder.py
index dce709e..16fe714 100755
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
739 if 'qcow2' in img_types: 739 if 'qcow2' in img_types:
740 qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS 740 qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
741 741
742 cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' % 742 log_fn = self._getBuildLog(diskimage.name, build_id)
743
744 cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
743 (self.dib_cmd, img_types, qemu_img_options, filename, 745 (self.dib_cmd, img_types, qemu_img_options, filename,
744 img_elements)) 746 log_fn, img_elements))
745 747
746 self._pruneBuildLogs(diskimage.name) 748 self._pruneBuildLogs(diskimage.name)
747 log_fn = self._getBuildLog(diskimage.name, build_id)
748 749
749 self.log.info('Running %s' % (cmd,)) 750 self.log.info('Running %s' % (cmd,))
750 self.log.info('Logging to %s' % (log_fn,)) 751 self.log.info('Logging to %s' % (log_fn,))
751 752
752 start_time = time.monotonic() 753 start_time = time.monotonic()
754
755 # We used to use readline() on stdout to output the lines to the
756 # build log. Unfortunately, this would block as long as the process
757 # ran (with no easy way to timeout the read) and wedge the builder.
758 # Now we use --logfile option to the dib command and set a timeout
759 # on the wait() call to prevent the wedge.
760 did_timeout = False
753 try: 761 try:
754 p = subprocess.Popen( 762 p = subprocess.Popen(
755 shlex.split(cmd), 763 shlex.split(cmd),
756 stdout=subprocess.PIPE,
757 stderr=subprocess.STDOUT, 764 stderr=subprocess.STDOUT,
758 env=env) 765 env=env)
759 except OSError as e: 766 except OSError as e:
@@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
761 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) 768 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 ) 769 )
763 770
764 with open(log_fn, 'wb') as log: 771 try:
765 while True: 772 rc = p.wait(timeout=diskimage.build_timeout)
766 ln = p.stdout.readline() 773 except subprocess.TimeoutExpired:
767 log.write(ln) 774 p.kill()
768 log.flush() 775 did_timeout = True
769 if not ln: 776 rc = 1
770 break 777 self.log.error(
771 778 "Build timeout for image %s, build %s (log: %s)",
772 rc = p.wait() 779 diskimage.name, build_id, log_fn)
773 m = "Exit code: %s\n" % rc 780 else:
774 log.write(m.encode('utf8')) 781 # Append return code to dib's log file
782 with open(log_fn, 'ab') as log:
783 m = "Exit code: %s\n" % rc
784 log.write(m.encode('utf8'))
775 785
776 # It's possible the connection to the ZK cluster could have been 786 # It's possible the connection to the ZK cluster could have been
777 # interrupted during the build. If so, wait for it to return. 787 # interrupted during the build. If so, wait for it to return.
@@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
796 self.log.info("ZooKeeper lost while building %s" % diskimage.name) 806 self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 self._zk.resetLostFlag() 807 self._zk.resetLostFlag()
798 build_data.state = zk.FAILED 808 build_data.state = zk.FAILED
799 elif p.returncode: 809 elif p.returncode or did_timeout:
800 self.log.info( 810 self.log.info(
801 "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) 811 "DIB failed creating %s (%s) (timeout=%s)" % (
812 diskimage.name, p.returncode, did_timeout))
802 build_data.state = zk.FAILED 813 build_data.state = zk.FAILED
803 else: 814 else:
804 self.log.info("DIB image %s is built" % diskimage.name) 815 self.log.info("DIB image %s is built" % diskimage.name)
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index c3dde57..f2646af 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -44,6 +44,7 @@ class ConfigValidator:
44 'rebuild-age': int, 44 'rebuild-age': int,
45 'env-vars': {str: str}, 45 'env-vars': {str: str},
46 'username': str, 46 'username': str,
47 'build-timeout': int,
47 } 48 }
48 49
49 webapp = { 50 webapp = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 2b5e4eb..7f09961 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 d.image_types = set(diskimage.get('formats', [])) 118 d.image_types = set(diskimage.get('formats', []))
119 d.pause = bool(diskimage.get('pause', False)) 119 d.pause = bool(diskimage.get('pause', False))
120 d.username = diskimage.get('username', 'zuul') 120 d.username = diskimage.get('username', 'zuul')
121 d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 self.diskimages[d.name] = d 122 self.diskimages[d.name] = d
122 123
123 def setSecureDiskimageEnv(self, diskimages, secure_config_path): 124 def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 self.image_types = None 180 self.image_types = None
180 self.pause = False 181 self.pause = False
181 self.username = None 182 self.username = None
183 self.build_timeout = None
182 184
183 def __eq__(self, other): 185 def __eq__(self, other):
184 if isinstance(other, DiskImage): 186 if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 other.env_vars == self.env_vars and 191 other.env_vars == self.env_vars and
190 other.image_types == self.image_types and 192 other.image_types == self.image_types and
191 other.pause == self.pause and 193 other.pause == self.pause and
192 other.username == self.username) 194 other.username == self.username and
195 other.build_timeout == self.build_timeout)
193 return False 196 return False
194 197
195 def __repr__(self): 198 def __repr__(self):
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 6cf67ff..e4c84a7 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase):
411 time.sleep(1) 411 time.sleep(1)
412 self.wait_for_threads() 412 self.wait_for_threads()
413 413
414 def waitForBuild(self, image_name, build_id): 414 def waitForBuild(self, image_name, build_id, states=None):
415 if states is None:
416 states = (zk.READY,)
417
415 base = "-".join([image_name, build_id]) 418 base = "-".join([image_name, build_id])
419
416 while True: 420 while True:
417 self.wait_for_threads() 421 self.wait_for_threads()
418 files = builder.DibImageFile.from_image_id( 422 build = self.zk.getBuild(image_name, build_id)
419 self._config_images_dir.path, base) 423 if build and build.state in states:
420 if files:
421 break 424 break
422 time.sleep(1) 425 time.sleep(1)
423 426
424 while True: 427 # We should only expect a dib manifest with a successful build.
428 while build.state == zk.READY:
425 self.wait_for_threads() 429 self.wait_for_threads()
426 build = self.zk.getBuild(image_name, build_id) 430 files = builder.DibImageFile.from_image_id(
427 if build and build.state == zk.READY: 431 self._config_images_dir.path, base)
432 if files:
428 break 433 break
429 time.sleep(1) 434 time.sleep(1)
430 435
diff --git a/nodepool/tests/fake-image-create b/nodepool/tests/fake-image-create
index 9d236ff..1a2406f 100755
--- a/nodepool/tests/fake-image-create
+++ b/nodepool/tests/fake-image-create
@@ -1,10 +1,49 @@
1#!/bin/bash 1#!/bin/bash
2 2
3outfile=
4outtypes=("qcow2")
5
6all_args=$*
7logfile=
8checksum=
9no_tmpfs=
10qemu_img_options=
11x=
12
13TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
14if [ $? -ne 0 ]; then
15 echo "Invalid option"
16 exit 1
17fi
18eval set -- "$TEMP"
19while true ; do
20 case "$1" in
21 --checksum)
22 checksum=1; shift 1;;
23 --no-tmpfs)
24 no_tmpfs=1; shift 1;;
25 --qemu-img-options)
26 qemu_img_options=$2; shift 2;;
27 --logfile)
28 logfile=$2; shift 2;;
29 -o) outfile=$2; shift 2;;
30 -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
31 -x) x=1; shift;;
32 --) shift ; break ;;
33 *) echo "Unknown option : $1"; exit 1;;
34 esac
35done
36
37# If --logfile was given, direct stdout to it, as well
38if [ ! -z "$logfile" ]; then
39 exec > >(tee -a ${logfile})
40fi
41
3echo "*** fake-image-create: start" 42echo "*** fake-image-create: start"
4 43
5echo "arguments:" 44echo "arguments:"
6echo "----" 45echo "----"
7echo $* 46echo "$all_args"
8echo "----" 47echo "----"
9 48
10if [[ "${SHOULD_FAIL}" == 'true' ]]; then 49if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
38 exit 1 77 exit 1
39fi 78fi
40 79
41outfile= 80if [ ! -z "$logfile" ]; then
42outtypes=("qcow2") 81 echo " -> logfile: $logfile"
43 82fi
44TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@") 83if [ ! -z "$checksum" ]; then
45if [ $? -ne 0 ]; then 84 echo " -> set --checksum"
46 echo "Invalid option" 85fi
47 exit 1 86if [ ! -z "$no_tmpfs" ]; then
87 echo " -> set --no-tmpfs"
88fi
89if [ ! -z "$qemu_img_options" ]; then
90 echo " -> qemu-img-options: $qemu_img_options"
91fi
92if [ ! -z "$x" ]; then
93 echo " -> debugging enabled"
48fi 94fi
49eval set -- "$TEMP"
50while true ; do
51 case "$1" in
52 --checksum)
53 echo " -> set --checksum"; shift 1;;
54 --no-tmpfs)
55 echo " -> set --no-tmpfs"; shift 1;;
56 --qemu-img-options)
57 echo " -> qemu-img-options: $2"; shift 2;;
58 -o) outfile=$2; shift 2;;
59 -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
60 -x) echo " -> debugging enabled"; shift;;
61 --) shift ; break ;;
62 *) echo "Unknown option : $1"; exit 1;;
63 esac
64done
65 95
66if [ -z "$outfile" ]; then 96if [ -z "$outfile" ]; then
67 echo "No output file specified." 97 echo "No output file specified."
diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml
index 9aba134..7e81657 100644
--- a/nodepool/tests/fixtures/config_validate/good.yaml
+++ b/nodepool/tests/fixtures/config_validate/good.yaml
@@ -152,6 +152,7 @@ diskimages:
152 - cache-devstack 152 - cache-devstack
153 release: trusty 153 release: trusty
154 rebuild-age: 3600 154 rebuild-age: 3600
155 build-timeout: 3600
155 env-vars: 156 env-vars:
156 TMPDIR: /opt/dib_tmp 157 TMPDIR: /opt/dib_tmp
157 DIB_IMAGE_CACHE: /opt/dib_cache 158 DIB_IMAGE_CACHE: /opt/dib_cache
diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
new file mode 100644
index 0000000..01770bd
--- /dev/null
+++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
@@ -0,0 +1,26 @@
1elements-dir: .
2images-dir: '{images_dir}'
3build-log-dir: '{build_log_dir}'
4
5zookeeper-servers:
6 - host: {zookeeper_host}
7 port: {zookeeper_port}
8 chroot: {zookeeper_chroot}
9
10labels: []
11
12providers: []
13
14diskimages:
15 - name: fake-image
16 formats:
17 - tar
18 elements:
19 - fedora
20 - vm
21 release: 21
22 env-vars:
23 TMPDIR: /opt/dib_tmp
24 DIB_IMAGE_CACHE: /opt/dib_cache
25 DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
26 BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py
index c992bf7..cfea4b8 100644
--- a/nodepool/tests/unit/test_builder.py
+++ b/nodepool/tests/unit/test_builder.py
@@ -16,6 +16,8 @@
16import os 16import os
17import uuid 17import uuid
18import fixtures 18import fixtures
19import mock
20import subprocess
19 21
20from nodepool import builder, exceptions, tests 22from nodepool import builder, exceptions, tests
21from nodepool.driver.fake import provider as fakeprovider 23from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 337
336 self.assertEqual(build_default._formats, ['qcow2']) 338 self.assertEqual(build_default._formats, ['qcow2'])
337 self.assertEqual(build_vhd._formats, ['vhd']) 339 self.assertEqual(build_vhd._formats, ['vhd'])
340
341 @mock.patch.object(subprocess.Popen, 'wait')
342 def test_diskimage_build_timeout(self, mock_wait):
343 mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
344 configfile = self.setup_config('diskimage_build_timeout.yaml')
345 self.useBuilder(configfile, cleanup_interval=0)
346 self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
new file mode 100644
index 0000000..6901903
--- /dev/null
+++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 A new option (build-timeout) has been added to the builder diskimage
5 configuration to control how long the builder should wait for image
6 builds before giving up. The default is 8 hours.