Merge "Fix for image build leaks"

This commit is contained in:
Zuul 2019-04-03 16:12:31 +00:00 committed by Gerrit Code Review
commit 9a223c4f67
4 changed files with 110 additions and 27 deletions

View File

@ -638,20 +638,55 @@ class BuildWorker(BaseWorker):
return
self.log.info("Building image %s" % diskimage.name)
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
data = self._buildImage(bnum, diskimage)
self._zk.storeBuild(diskimage.name, data, bnum)
self._buildWrapper(diskimage)
except exceptions.ZKLockException:
# Lock is already held. Skip it.
pass
def _buildWrapper(self, diskimage):
'''
Wraps logic for disk image building and ZooKeeper recording.
:returns: The updated ImageBuild data structure.
'''
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
try:
data = self._buildImage(bnum, diskimage)
except Exception:
# If something unexpected happens, make sure we clean up any
# build cruft by executing the fallthrough below.
self.log.exception("Image build failure for build %s of image %s:",
bnum, diskimage.name)
data.state = zk.FAILED
# If we lost the session during the long running build, this means
# we've lost our lock and the node was possibly cleaned up during the
# long build time because state was BUILDING and unlocked. And because
# the cleanup likely did not get the in-progress dib build files, we
# need to make sure we get the rest here.
try:
self._zk.storeBuild(diskimage.name, data, bnum)
except Exception:
# If it disappeared, or the save otherwise failed, cleanup the
# leaked build manually.
self.log.exception(
"Unable to update record for build %s of image %s:",
bnum, diskimage.name)
data.id = bnum
CleanupWorker.deleteLocalBuild(
self._config.imagesdir, diskimage.name, data, self.log)
data.state = zk.FAILED
return data
return self._zk.getBuild(diskimage.name, bnum)
def _checkForManualBuildRequest(self):
'''
Query ZooKeeper for any manual image build requests.
@ -689,16 +724,7 @@ class BuildWorker(BaseWorker):
self.log.info(
"Manual build request for image %s" % diskimage.name)
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
data = self._buildImage(bnum, diskimage)
self._zk.storeBuild(diskimage.name, data, bnum)
data = self._buildWrapper(diskimage)
# Remove request on a successful build
if data.state == zk.READY:

View File

@ -535,9 +535,10 @@ class DBTestCase(BaseTestCase):
return app
def useBuilder(self, configfile, securefile=None, cleanup_interval=.5):
self.useFixture(
builder_fixture = self.useFixture(
BuilderFixture(configfile, cleanup_interval, securefile)
)
return builder_fixture.builder
def setupZK(self):
f = ZookeeperServerFixture()

View File

@ -61,14 +61,23 @@ done
if [ -z "$outfile" ]; then
echo "No output file specified."
exit 1
else
for outtype in ${outtypes[@]} ; do
echo "fake-data" > $outfile.$outtype
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
done
fi
# Tests can pause this script by creating this file before it runs.
# No dib files should be created yet at the pause point.
tmpdir=$(dirname $outfile)
pause_file="$tmpdir/fake-image-create.pause"
while [ -f "$pause_file" ]
do
sleep .1
done
for outtype in ${outtypes[@]} ; do
echo "fake-data" > $outfile.$outtype
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
done
# Emulate manifest creation
mkdir $outfile.d
@ -78,3 +87,6 @@ if [[ "${SHOULD_FAIL}" == 'true' ]]; then
fi
echo "*** fake-image-create: done"
# Tests might need to know when this process is completed
touch $tmpdir/fake-image-create.done

View File

@ -17,6 +17,7 @@ import os
import uuid
import fixtures
import mock
import time
from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider
@ -350,3 +351,46 @@ class TestNodePoolBuilder(tests.DBTestCase):
builder.BUILD_PROCESS_POLL_TIMEOUT = 500
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
def test_session_loss_during_build(self):
configfile = self.setup_config('node.yaml')
# We need to make the image build process pause so we can introduce
# a simulated ZK session loss. The fake dib process will sleep while
# the pause file is present in the images directory supplied to it.
pause_file = os.path.join(self._config_images_dir.path,
"fake-image-create.pause")
open(pause_file, 'w')
# Disable cleanup thread to verify builder cleans up after itself
bldr = self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.BUILDING,))
# The build should now be paused just before writing out any DIB files.
# Mock the next call to storeBuild() which is supposed to be the update
# of the current build in ZooKeeper. This failure simulates losing the
# ZK session and not being able to update the record.
bldr.zk.storeBuild = mock.Mock(side_effect=Exception('oops'))
# Allow the fake-image-create to continue by removing the pause file
os.remove(pause_file)
# The fake dib process writes out a .done file at the end. We need
# this so we do not continue in this test until *after* all files are
# written by that dib process.
done_file = os.path.join(self._config_images_dir.path,
"fake-image-create.done")
while not os.path.exists(done_file):
time.sleep(.1)
# There shouldn't be any DIB files even though cleanup thread is
# disabled because the builder should clean up after itself.
images_dir = bldr._config.imagesdir
# Wait for builder to remove the leaked files
image_files = builder.DibImageFile.from_image_id(
images_dir, 'fake-image-0000000001')
while image_files:
time.sleep(.1)
image_files = builder.DibImageFile.from_image_id(
images_dir, 'fake-image-0000000001')