summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2019-02-06 13:16:06 +0000
committerGerrit Code Review <review@openstack.org>2019-02-06 13:16:06 +0000
commit280cd5937d9b5f7c8f7830106ea2d142a8c56c82 (patch)
tree237da569d292fe75619a1c4efd6cf76542a4d5b6
parentf8f4b623ee54bda28c973650cd3a921b7a4d67ed (diff)
parent890ea4975eaafa147fd7c87dbfcdd179d3e0d7b8 (diff)
Merge "Revert "Revert "Add a timeout for the image build"""
-rw-r--r--doc/source/configuration.rst6
-rwxr-xr-xnodepool/builder.py85
-rw-r--r--nodepool/cmd/config_validator.py1
-rwxr-xr-xnodepool/config.py5
-rw-r--r--nodepool/tests/__init__.py19
-rw-r--r--nodepool/tests/fixtures/config_validate/good.yaml1
-rw-r--r--nodepool/tests/fixtures/diskimage_build_timeout.yaml27
-rw-r--r--nodepool/tests/unit/test_builder.py8
-rw-r--r--releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml6
9 files changed, 139 insertions, 19 deletions
diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 34dcdd8..ba442c0 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -277,6 +277,12 @@ Options
277 Specifies the distro to be used as a base image to build the image using 277 Specifies the distro to be used as a base image to build the image using
278 diskimage-builder. 278 diskimage-builder.
279 279
280 .. attr:: build-timeout
281 :type: int
282
283 How long (in seconds) to wait for the diskimage build before giving up.
284 The default is 8 hours.
285
280 .. attr:: elements 286 .. attr:: elements
281 :type: list 287 :type: list
282 288
diff --git a/nodepool/builder.py b/nodepool/builder.py
index dce709e..196e6e0 100755
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -13,8 +13,10 @@
13# License for the specific language governing permissions and limitations 13# License for the specific language governing permissions and limitations
14# under the License. 14# under the License.
15 15
16import fcntl
16import logging 17import logging
17import os 18import os
19import select
18import shutil 20import shutil
19import socket 21import socket
20import subprocess 22import subprocess
@@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30
42# so just hardcode it for all qcow2 building 44# so just hardcode it for all qcow2 building
43DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" 45DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
44 46
47# DIB process polling timeout, in milliseconds
48BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000
49
45 50
46class DibImageFile(object): 51class DibImageFile(object):
47 ''' 52 '''
@@ -750,6 +755,7 @@ class BuildWorker(BaseWorker):
750 self.log.info('Logging to %s' % (log_fn,)) 755 self.log.info('Logging to %s' % (log_fn,))
751 756
752 start_time = time.monotonic() 757 start_time = time.monotonic()
758
753 try: 759 try:
754 p = subprocess.Popen( 760 p = subprocess.Popen(
755 shlex.split(cmd), 761 shlex.split(cmd),
@@ -761,17 +767,73 @@ class BuildWorker(BaseWorker):
761 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) 767 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 ) 768 )
763 769
770 # Make subprocess stdout non-blocking
771 fd = p.stdout.fileno()
772 fl = fcntl.fcntl(fd, fcntl.F_GETFL)
773 fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
774
775 # Poll subprocess stdout for readability
776 r, w, e = select.select([fd], [], [], 0)
777 bitmask = (select.POLLIN | select.POLLHUP)
778 poll = select.poll()
779 poll.register(fd, bitmask)
780
781 rc = None
782 did_timeout = False
783 subprocess_done = False
784
785 def buildDidTimeout():
786 build_time = time.monotonic() - start_time
787 if build_time > diskimage.build_timeout:
788 return True
789 return False
790
764 with open(log_fn, 'wb') as log: 791 with open(log_fn, 'wb') as log:
765 while True:
766 ln = p.stdout.readline()
767 log.write(ln)
768 log.flush()
769 if not ln:
770 break
771 792
772 rc = p.wait() 793 # While the subprocess is running, we will loop through stdout
773 m = "Exit code: %s\n" % rc 794 # events. If we can read data, write that out to the log file.
774 log.write(m.encode('utf8')) 795 # If the subprocess completes, set the flag so we can exit and
796 # write the return code.
797 #
798 # We check for build timeouts in two places: after we have read
799 # all available data in the buffer (which will cause an early exit
800 # of the poll loop), and after a poll timeout. If we did not have
801 # a check after the reads, we would have to have a poll timeout
802 # to occur to check for a build timeout, which may not happen if
803 # there is constantly data available for reading.
804
805 while not subprocess_done:
806 for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT):
807
808 # Data available for reading
809 if event & select.POLLIN:
810 data = p.stdout.read(1024)
811 while data:
812 log.write(data)
813 log.flush()
814 data = p.stdout.read(1024)
815 if buildDidTimeout():
816 break
817
818 # Subprocess complete
819 elif event & select.POLLHUP:
820 subprocess_done = True
821 rc = p.wait()
822
823 if not subprocess_done:
824 if buildDidTimeout():
825 did_timeout = True
826 rc = 1
827 self.log.error(
828 "Build timeout for image %s, build %s (log: %s)",
829 diskimage.name, build_id, log_fn)
830 p.kill()
831 break
832
833 # Subprocess finished, write return code
834 if not did_timeout:
835 m = "Exit code: %s\n" % rc
836 log.write(m.encode('utf8'))
775 837
776 # It's possible the connection to the ZK cluster could have been 838 # It's possible the connection to the ZK cluster could have been
777 # interrupted during the build. If so, wait for it to return. 839 # interrupted during the build. If so, wait for it to return.
@@ -796,9 +858,10 @@ class BuildWorker(BaseWorker):
796 self.log.info("ZooKeeper lost while building %s" % diskimage.name) 858 self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 self._zk.resetLostFlag() 859 self._zk.resetLostFlag()
798 build_data.state = zk.FAILED 860 build_data.state = zk.FAILED
799 elif p.returncode: 861 elif p.returncode or did_timeout:
800 self.log.info( 862 self.log.info(
801 "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) 863 "DIB failed creating %s (%s) (timeout=%s)" % (
864 diskimage.name, p.returncode, did_timeout))
802 build_data.state = zk.FAILED 865 build_data.state = zk.FAILED
803 else: 866 else:
804 self.log.info("DIB image %s is built" % diskimage.name) 867 self.log.info("DIB image %s is built" % diskimage.name)
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index c3dde57..f2646af 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -44,6 +44,7 @@ class ConfigValidator:
44 'rebuild-age': int, 44 'rebuild-age': int,
45 'env-vars': {str: str}, 45 'env-vars': {str: str},
46 'username': str, 46 'username': str,
47 'build-timeout': int,
47 } 48 }
48 49
49 webapp = { 50 webapp = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 2b5e4eb..7f09961 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 d.image_types = set(diskimage.get('formats', [])) 118 d.image_types = set(diskimage.get('formats', []))
119 d.pause = bool(diskimage.get('pause', False)) 119 d.pause = bool(diskimage.get('pause', False))
120 d.username = diskimage.get('username', 'zuul') 120 d.username = diskimage.get('username', 'zuul')
121 d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 self.diskimages[d.name] = d 122 self.diskimages[d.name] = d
122 123
123 def setSecureDiskimageEnv(self, diskimages, secure_config_path): 124 def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 self.image_types = None 180 self.image_types = None
180 self.pause = False 181 self.pause = False
181 self.username = None 182 self.username = None
183 self.build_timeout = None
182 184
183 def __eq__(self, other): 185 def __eq__(self, other):
184 if isinstance(other, DiskImage): 186 if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 other.env_vars == self.env_vars and 191 other.env_vars == self.env_vars and
190 other.image_types == self.image_types and 192 other.image_types == self.image_types and
191 other.pause == self.pause and 193 other.pause == self.pause and
192 other.username == self.username) 194 other.username == self.username and
195 other.build_timeout == self.build_timeout)
193 return False 196 return False
194 197
195 def __repr__(self): 198 def __repr__(self):
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index bce28cd..7b860b9 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -414,20 +414,25 @@ class DBTestCase(BaseTestCase):
414 time.sleep(1) 414 time.sleep(1)
415 self.wait_for_threads() 415 self.wait_for_threads()
416 416
417 def waitForBuild(self, image_name, build_id): 417 def waitForBuild(self, image_name, build_id, states=None):
418 if states is None:
419 states = (zk.READY,)
420
418 base = "-".join([image_name, build_id]) 421 base = "-".join([image_name, build_id])
422
419 while True: 423 while True:
420 self.wait_for_threads() 424 self.wait_for_threads()
421 files = builder.DibImageFile.from_image_id( 425 build = self.zk.getBuild(image_name, build_id)
422 self._config_images_dir.path, base) 426 if build and build.state in states:
423 if files:
424 break 427 break
425 time.sleep(1) 428 time.sleep(1)
426 429
427 while True: 430 # We should only expect a dib manifest with a successful build.
431 while build.state == zk.READY:
428 self.wait_for_threads() 432 self.wait_for_threads()
429 build = self.zk.getBuild(image_name, build_id) 433 files = builder.DibImageFile.from_image_id(
430 if build and build.state == zk.READY: 434 self._config_images_dir.path, base)
435 if files:
431 break 436 break
432 time.sleep(1) 437 time.sleep(1)
433 438
diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml
index 3bb327b..2dcbbfd 100644
--- a/nodepool/tests/fixtures/config_validate/good.yaml
+++ b/nodepool/tests/fixtures/config_validate/good.yaml
@@ -174,6 +174,7 @@ diskimages:
174 - cache-devstack 174 - cache-devstack
175 release: trusty 175 release: trusty
176 rebuild-age: 3600 176 rebuild-age: 3600
177 build-timeout: 3600
177 env-vars: 178 env-vars:
178 TMPDIR: /opt/dib_tmp 179 TMPDIR: /opt/dib_tmp
179 DIB_IMAGE_CACHE: /opt/dib_cache 180 DIB_IMAGE_CACHE: /opt/dib_cache
diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
new file mode 100644
index 0000000..b133181
--- /dev/null
+++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
@@ -0,0 +1,27 @@
1elements-dir: .
2images-dir: '{images_dir}'
3build-log-dir: '{build_log_dir}'
4
5zookeeper-servers:
6 - host: {zookeeper_host}
7 port: {zookeeper_port}
8 chroot: {zookeeper_chroot}
9
10labels: []
11
12providers: []
13
14diskimages:
15 - name: fake-image
16 build-timeout: 2
17 formats:
18 - tar
19 elements:
20 - fedora
21 - vm
22 release: 21
23 env-vars:
24 TMPDIR: /opt/dib_tmp
25 DIB_IMAGE_CACHE: /opt/dib_cache
26 DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
27 BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py
index c992bf7..1379ec0 100644
--- a/nodepool/tests/unit/test_builder.py
+++ b/nodepool/tests/unit/test_builder.py
@@ -16,6 +16,7 @@
16import os 16import os
17import uuid 17import uuid
18import fixtures 18import fixtures
19import mock
19 20
20from nodepool import builder, exceptions, tests 21from nodepool import builder, exceptions, tests
21from nodepool.driver.fake import provider as fakeprovider 22from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 336
336 self.assertEqual(build_default._formats, ['qcow2']) 337 self.assertEqual(build_default._formats, ['qcow2'])
337 self.assertEqual(build_vhd._formats, ['vhd']) 338 self.assertEqual(build_vhd._formats, ['vhd'])
339
340 @mock.patch('select.poll')
341 def test_diskimage_build_timeout(self, mock_poll):
342 configfile = self.setup_config('diskimage_build_timeout.yaml')
343 builder.BUILD_PROCESS_POLL_TIMEOUT = 500
344 self.useBuilder(configfile, cleanup_interval=0)
345 self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
new file mode 100644
index 0000000..6901903
--- /dev/null
+++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
@@ -0,0 +1,6 @@
1---
2features:
3 - |
4 A new option (build-timeout) has been added to the builder diskimage
5 configuration to control how long the builder should wait for image
6 builds before giving up. The default is 8 hours.