summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Shrewsbury <dshrewsb@redhat.com>2019-01-29 11:50:08 -0500
committerDavid Shrewsbury <dshrewsb@redhat.com>2019-01-31 11:36:47 -0500
commit890ea4975eaafa147fd7c87dbfcdd179d3e0d7b8 (patch)
tree5edfe4dd6fdb66b1a552494732b659bf01bad1a6
parentacf038aef4c4ebb3fd410a333e6c35b002e76e8a (diff)
Revert "Revert "Add a timeout for the image build""
This reverts commit ccf40a462a4805c7162bc34434cfe718cd96c448. The previous version would not work properly when daemonized because there was no stdout. This version maintains stdout and uses select/poll with non-blocking stdout to capture the output to a log file. Depends-On: https://review.openstack.org/634266 Change-Id: I7f0617b91e071294fe6051d14475ead1d7df56b7
Notes
Notes (review): Code-Review+2: Tobias Henkel <tobias.henkel@bmw.de> Code-Review+2: Ian Wienand <iwienand@redhat.com> Code-Review+2: Monty Taylor <mordred@inaugust.com> Workflow+1: Monty Taylor <mordred@inaugust.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Wed, 06 Feb 2019 13:16:06 +0000 Reviewed-on: https://review.openstack.org/633792 Project: openstack-infra/nodepool Branch: refs/heads/master
-rw-r--r--doc/source/configuration.rst6
-rwxr-xr-xnodepool/builder.py85
-rw-r--r--nodepool/cmd/config_validator.py1
-rwxr-xr-xnodepool/config.py5
-rw-r--r--nodepool/tests/__init__.py19
-rw-r--r--nodepool/tests/fixtures/config_validate/good.yaml1
-rw-r--r--nodepool/tests/fixtures/diskimage_build_timeout.yaml27
-rw-r--r--nodepool/tests/unit/test_builder.py8
-rw-r--r--releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml6
9 files changed, 139 insertions, 19 deletions
diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 34dcdd8..ba442c0 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -277,6 +277,12 @@ Options
277 Specifies the distro to be used as a base image to build the image using 277 Specifies the distro to be used as a base image to build the image using
278 diskimage-builder. 278 diskimage-builder.
279 279
280 .. attr:: build-timeout
281 :type: int
282
283 How long (in seconds) to wait for the diskimage build before giving up.
284 The default is 8 hours.
285
280 .. attr:: elements 286 .. attr:: elements
281 :type: list 287 :type: list
282 288
diff --git a/nodepool/builder.py b/nodepool/builder.py
index dce709e..196e6e0 100755
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -13,8 +13,10 @@
13# License for the specific language governing permissions and limitations 13# License for the specific language governing permissions and limitations
14# under the License. 14# under the License.
15 15
16import fcntl
16import logging 17import logging
17import os 18import os
19import select
18import shutil 20import shutil
19import socket 21import socket
20import subprocess 22import subprocess
@@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30
42# so just hardcode it for all qcow2 building 44# so just hardcode it for all qcow2 building
43DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" 45DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
44 46
47# DIB process polling timeout, in milliseconds
48BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000
49
45 50
46class DibImageFile(object): 51class DibImageFile(object):
47 ''' 52 '''
@@ -750,6 +755,7 @@ class BuildWorker(BaseWorker):
750 self.log.info('Logging to %s' % (log_fn,)) 755 self.log.info('Logging to %s' % (log_fn,))
751 756
752 start_time = time.monotonic() 757 start_time = time.monotonic()
758
753 try: 759 try:
754 p = subprocess.Popen( 760 p = subprocess.Popen(
755 shlex.split(cmd), 761 shlex.split(cmd),
@@ -761,17 +767,73 @@ class BuildWorker(BaseWorker):
761 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) 767 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 ) 768 )
763 769
770 # Make subprocess stdout non-blocking
771 fd = p.stdout.fileno()
772 fl = fcntl.fcntl(fd, fcntl.F_GETFL)
773 fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
774
775 # Poll subprocess stdout for readability
776 r, w, e = select.select([fd], [], [], 0)
777 bitmask = (select.POLLIN | select.POLLHUP)
778 poll = select.poll()
779 poll.register(fd, bitmask)
780
781 rc = None
782 did_timeout = False
783 subprocess_done = False
784
785 def buildDidTimeout():
786 build_time = time.monotonic() - start_time
787 if build_time > diskimage.build_timeout:
788 return True
789 return False
790
764 with open(log_fn, 'wb') as log: 791 with open(log_fn, 'wb') as log:
765 while True:
766 ln = p.stdout.readline()
767 log.write(ln)
768 log.flush()
769 if not ln:
770 break
771 792
772 rc = p.wait() 793 # While the subprocess is running, we will loop through stdout
773 m = "Exit code: %s\n" % rc 794 # events. If we can read data, write that out to the log file.
774 log.write(m.encode('utf8')) 795 # If the subprocess completes, set the flag so we can exit and
796 # write the return code.
797 #
798 # We check for build timeouts in two places: after we have read
799 # all available data in the buffer (which will cause an early exit
800 # of the poll loop), and after a poll timeout. If we did not have
801 # a check after the reads, we would have to have a poll timeout
802 # to occur to check for a build timeout, which may not happen if
803 # there is constantly data available for reading.
804
805 while not subprocess_done:
806 for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT):
807
808 # Data available for reading
809 if event & select.POLLIN:
810 data = p.stdout.read(1024)
811 while data:
812 log.write(data)
813 log.flush()
814 data = p.stdout.read(1024)
815 if buildDidTimeout():
816 break
817
818 # Subprocess complete
819 elif event & select.POLLHUP:
820 subprocess_done = True
821 rc = p.wait()
822
823 if not subprocess_done:
824 if buildDidTimeout():
825 did_timeout = True
826 rc = 1
827 self.log.error(
828 "Build timeout for image %s, build %s (log: %s)",
829 diskimage.name, build_id, log_fn)
830 p.kill()
831 break
832
833 # Subprocess finished, write return code
834 if not did_timeout:
835 m = "Exit code: %s\n" % rc
836 log.write(m.encode('utf8'))
775 837
776 # It's possible the connection to the ZK cluster could have been 838 # It's possible the connection to the ZK cluster could have been
777 # interrupted during the build. If so, wait for it to return. 839 # interrupted during the build. If so, wait for it to return.
@@ -796,9 +858,10 @@ class BuildWorker(BaseWorker):
796 self.log.info("ZooKeeper lost while building %s" % diskimage.name) 858 self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 self._zk.resetLostFlag() 859 self._zk.resetLostFlag()
798 build_data.state = zk.FAILED 860 build_data.state = zk.FAILED
799 elif p.returncode: 861 elif p.returncode or did_timeout:
800 self.log.info( 862 self.log.info(
801 "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) 863 "DIB failed creating %s (%s) (timeout=%s)" % (
864 diskimage.name, p.returncode, did_timeout))
802 build_data.state = zk.FAILED 865 build_data.state = zk.FAILED
803 else: 866 else:
804 self.log.info("DIB image %s is built" % diskimage.name) 867 self.log.info("DIB image %s is built" % diskimage.name)
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index c3dde57..f2646af 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -44,6 +44,7 @@ class ConfigValidator:
44 'rebuild-age': int, 44 'rebuild-age': int,
45 'env-vars': {str: str}, 45 'env-vars': {str: str},
46 'username': str, 46 'username': str,
47 'build-timeout': int,
47 } 48 }
48 49
49 webapp = { 50 webapp = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 2b5e4eb..7f09961 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 d.image_types = set(diskimage.get('formats', [])) 118 d.image_types = set(diskimage.get('formats', []))
119 d.pause = bool(diskimage.get('pause', False)) 119 d.pause = bool(diskimage.get('pause', False))
120 d.username = diskimage.get('username', 'zuul') 120 d.username = diskimage.get('username', 'zuul')
121 d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 self.diskimages[d.name] = d 122 self.diskimages[d.name] = d
122 123
123 def setSecureDiskimageEnv(self, diskimages, secure_config_path): 124 def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 self.image_types = None 180 self.image_types = None
180 self.pause = False 181 self.pause = False
181 self.username = None 182 self.username = None
183 self.build_timeout = None
182 184
183 def __eq__(self, other): 185 def __eq__(self, other):
184 if isinstance(other, DiskImage): 186 if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 other.env_vars == self.env_vars and 191 other.env_vars == self.env_vars and
190 other.image_types == self.image_types and 192 other.image_types == self.image_types and
191 other.pause == self.pause and 193 other.pause == self.pause and
192 other.username == self.username) 194 other.username == self.username and
195 other.build_timeout == self.build_timeout)
193 return False 196 return False
194 197
195 def __repr__(self): 198 def __repr__(self):
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 6cf67ff..e4c84a7 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase):
411 time.sleep(1) 411 time.sleep(1)
412 self.wait_for_threads() 412 self.wait_for_threads()
413 413
414 def waitForBuild(self, image_name, build_id): 414 def waitForBuild(self, image_name, build_id, states=None):
415 if states is None:
416 states = (zk.READY,)
417
415 base = "-".join([image_name, build_id]) 418 base = "-".join([image_name, build_id])
419
416 while True: 420 while True:
417 self.wait_for_threads() 421 self.wait_for_threads()
418 files = builder.DibImageFile.from_image_id( 422 build = self.zk.getBuild(image_name, build_id)
419 self._config_images_dir.path, base) 423 if build and build.state in states:
420 if files:
421 break 424 break
422 time.sleep(1) 425 time.sleep(1)
423 426
424 while True: 427 # We should only expect a dib manifest with a successful build.
428 while build.state == zk.READY:
425 self.wait_for_threads() 429 self.wait_for_threads()
426 build = self.zk.getBuild(image_name, build_id) 430 files = builder.DibImageFile.from_image_id(
427 if build and build.state == zk.READY: 431 self._config_images_dir.path, base)
432 if files:
428 break 433 break
429 time.sleep(1) 434 time.sleep(1)
430 435
diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml
index 9aba134..7e81657 100644
--- a/nodepool/tests/fixtures/config_validate/good.yaml
+++ b/nodepool/tests/fixtures/config_validate/good.yaml
@@ -152,6 +152,7 @@ diskimages:
152 - cache-devstack 152 - cache-devstack
153 release: trusty 153 release: trusty
154 rebuild-age: 3600 154 rebuild-age: 3600
155 build-timeout: 3600
155 env-vars: 156 env-vars:
156 TMPDIR: /opt/dib_tmp 157 TMPDIR: /opt/dib_tmp
157 DIB_IMAGE_CACHE: /opt/dib_cache 158 DIB_IMAGE_CACHE: /opt/dib_cache
diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
new file mode 100644
index 0000000..b133181
--- /dev/null
+++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
@@ -0,0 +1,27 @@
1elements-dir: .
2images-dir: '{images_dir}'
3build-log-dir: '{build_log_dir}'
4
5zookeeper-servers:
6 - host: {zookeeper_host}
7 port: {zookeeper_port}
8 chroot: {zookeeper_chroot}
9
10labels: []
11
12providers: []
13
14diskimages:
15 - name: fake-image
16 build-timeout: 2
17 formats:
18 - tar
19 elements:
20 - fedora
21 - vm
22 release: 21
23 env-vars:
24 TMPDIR: /opt/dib_tmp
25 DIB_IMAGE_CACHE: /opt/dib_cache
26 DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
27 BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py
index c992bf7..1379ec0 100644
--- a/nodepool/tests/unit/test_builder.py
+++ b/nodepool/tests/unit/test_builder.py
@@ -16,6 +16,7 @@
16import os 16import os
17import uuid 17import uuid
18import fixtures 18import fixtures
19import mock
19 20
20from nodepool import builder, exceptions, tests 21from nodepool import builder, exceptions, tests
21from nodepool.driver.fake import provider as fakeprovider 22from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 336
336 self.assertEqual(build_default._formats, ['qcow2']) 337 self.assertEqual(build_default._formats, ['qcow2'])
337 self.assertEqual(build_vhd._formats, ['vhd']) 338 self.assertEqual(build_vhd._formats, ['vhd'])
339
340 @mock.patch('select.poll')
341 def test_diskimage_build_timeout(self, mock_poll):
342 configfile = self.setup_config('diskimage_build_timeout.yaml')
343 builder.BUILD_PROCESS_POLL_TIMEOUT = 500
344 self.useBuilder(configfile, cleanup_interval=0)
345 self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
new file mode 100644
index 0000000..6901903
--- /dev/null
+++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 A new option (build-timeout) has been added to the builder diskimage
5 configuration to control how long the builder should wait for image
6 builds before giving up. The default is 8 hours.