Fix for image build leaks
If, during a long DIB image build, we lose the ZooKeeper session, it's likely that the CleanupWorker thread could have run and removed the ZK record for the build (its state would be BUILDING and unlocked, indicating something went wrong). In that scenario, when the DIB process finishes (possibly writing out DIB files), it will never get cleaned up since the ZK record would now be gone. If we fail to update the ZK record at the end of the build, just delete the leaked DIB files immediately after the build. Change-Id: I5cb58318efe51b5b0c3413b7a01f02a50215a8b6
This commit is contained in:
parent
8421476a51
commit
fa2d4bd17c
|
@ -638,20 +638,55 @@ class BuildWorker(BaseWorker):
|
|||
return
|
||||
|
||||
self.log.info("Building image %s" % diskimage.name)
|
||||
|
||||
data = zk.ImageBuild()
|
||||
data.state = zk.BUILDING
|
||||
data.builder_id = self._builder_id
|
||||
data.builder = self._hostname
|
||||
data.formats = list(diskimage.image_types)
|
||||
|
||||
bnum = self._zk.storeBuild(diskimage.name, data)
|
||||
data = self._buildImage(bnum, diskimage)
|
||||
self._zk.storeBuild(diskimage.name, data, bnum)
|
||||
self._buildWrapper(diskimage)
|
||||
except exceptions.ZKLockException:
|
||||
# Lock is already held. Skip it.
|
||||
pass
|
||||
|
||||
def _buildWrapper(self, diskimage):
|
||||
'''
|
||||
Wraps logic for disk image building and ZooKeeper recording.
|
||||
|
||||
:returns: The updated ImageBuild data structure.
|
||||
'''
|
||||
data = zk.ImageBuild()
|
||||
data.state = zk.BUILDING
|
||||
data.builder_id = self._builder_id
|
||||
data.builder = self._hostname
|
||||
data.formats = list(diskimage.image_types)
|
||||
|
||||
bnum = self._zk.storeBuild(diskimage.name, data)
|
||||
|
||||
try:
|
||||
data = self._buildImage(bnum, diskimage)
|
||||
except Exception:
|
||||
# If something unexpected happens, make sure we clean up any
|
||||
# build cruft by executing the fallthrough below.
|
||||
self.log.exception("Image build failure for build %s of image %s:",
|
||||
bnum, diskimage.name)
|
||||
data.state = zk.FAILED
|
||||
|
||||
# If we lost the session during the long running build, this means
|
||||
# we've lost our lock and the node was possibly cleaned up during the
|
||||
# long build time because state was BUILDING and unlocked. And because
|
||||
# the cleanup likely did not get the in-progress dib build files, we
|
||||
# need to make sure we get the rest here.
|
||||
try:
|
||||
self._zk.storeBuild(diskimage.name, data, bnum)
|
||||
except Exception:
|
||||
# If it disappeared, or the save otherwise failed, cleanup the
|
||||
# leaked build manually.
|
||||
self.log.exception(
|
||||
"Unable to update record for build %s of image %s:",
|
||||
bnum, diskimage.name)
|
||||
data.id = bnum
|
||||
CleanupWorker.deleteLocalBuild(
|
||||
self._config.imagesdir, diskimage.name, data, self.log)
|
||||
data.state = zk.FAILED
|
||||
return data
|
||||
|
||||
return self._zk.getBuild(diskimage.name, bnum)
|
||||
|
||||
def _checkForManualBuildRequest(self):
|
||||
'''
|
||||
Query ZooKeeper for any manual image build requests.
|
||||
|
@ -689,16 +724,7 @@ class BuildWorker(BaseWorker):
|
|||
|
||||
self.log.info(
|
||||
"Manual build request for image %s" % diskimage.name)
|
||||
|
||||
data = zk.ImageBuild()
|
||||
data.state = zk.BUILDING
|
||||
data.builder_id = self._builder_id
|
||||
data.builder = self._hostname
|
||||
data.formats = list(diskimage.image_types)
|
||||
|
||||
bnum = self._zk.storeBuild(diskimage.name, data)
|
||||
data = self._buildImage(bnum, diskimage)
|
||||
self._zk.storeBuild(diskimage.name, data, bnum)
|
||||
data = self._buildWrapper(diskimage)
|
||||
|
||||
# Remove request on a successful build
|
||||
if data.state == zk.READY:
|
||||
|
|
|
@ -535,9 +535,10 @@ class DBTestCase(BaseTestCase):
|
|||
return app
|
||||
|
||||
def useBuilder(self, configfile, securefile=None, cleanup_interval=.5):
|
||||
self.useFixture(
|
||||
builder_fixture = self.useFixture(
|
||||
BuilderFixture(configfile, cleanup_interval, securefile)
|
||||
)
|
||||
return builder_fixture.builder
|
||||
|
||||
def setupZK(self):
|
||||
f = ZookeeperServerFixture()
|
||||
|
|
|
@ -61,14 +61,23 @@ done
|
|||
if [ -z "$outfile" ]; then
|
||||
echo "No output file specified."
|
||||
exit 1
|
||||
else
|
||||
for outtype in ${outtypes[@]} ; do
|
||||
echo "fake-data" > $outfile.$outtype
|
||||
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
|
||||
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
|
||||
done
|
||||
fi
|
||||
|
||||
# Tests can pause this script by creating this file before it runs.
|
||||
# No dib files should be created yet at the pause point.
|
||||
tmpdir=$(dirname $outfile)
|
||||
pause_file="$tmpdir/fake-image-create.pause"
|
||||
while [ -f "$pause_file" ]
|
||||
do
|
||||
sleep .1
|
||||
done
|
||||
|
||||
for outtype in ${outtypes[@]} ; do
|
||||
echo "fake-data" > $outfile.$outtype
|
||||
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
|
||||
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
|
||||
done
|
||||
|
||||
# Emulate manifest creation
|
||||
mkdir $outfile.d
|
||||
|
||||
|
@ -78,3 +87,6 @@ if [[ "${SHOULD_FAIL}" == 'true' ]]; then
|
|||
fi
|
||||
|
||||
echo "*** fake-image-create: done"
|
||||
|
||||
# Tests might need to know when this process is completed
|
||||
touch $tmpdir/fake-image-create.done
|
||||
|
|
|
@ -17,6 +17,7 @@ import os
|
|||
import uuid
|
||||
import fixtures
|
||||
import mock
|
||||
import time
|
||||
|
||||
from nodepool import builder, exceptions, tests
|
||||
from nodepool.driver.fake import provider as fakeprovider
|
||||
|
@ -350,3 +351,46 @@ class TestNodePoolBuilder(tests.DBTestCase):
|
|||
builder.BUILD_PROCESS_POLL_TIMEOUT = 500
|
||||
self.useBuilder(configfile, cleanup_interval=0)
|
||||
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
|
||||
|
||||
def test_session_loss_during_build(self):
|
||||
configfile = self.setup_config('node.yaml')
|
||||
|
||||
# We need to make the image build process pause so we can introduce
|
||||
# a simulated ZK session loss. The fake dib process will sleep while
|
||||
# the pause file is present in the images directory supplied to it.
|
||||
pause_file = os.path.join(self._config_images_dir.path,
|
||||
"fake-image-create.pause")
|
||||
open(pause_file, 'w')
|
||||
|
||||
# Disable cleanup thread to verify builder cleans up after itself
|
||||
bldr = self.useBuilder(configfile, cleanup_interval=0)
|
||||
self.waitForBuild('fake-image', '0000000001', states=(zk.BUILDING,))
|
||||
|
||||
# The build should now be paused just before writing out any DIB files.
|
||||
# Mock the next call to storeBuild() which is supposed to be the update
|
||||
# of the current build in ZooKeeper. This failure simulates losing the
|
||||
# ZK session and not being able to update the record.
|
||||
bldr.zk.storeBuild = mock.Mock(side_effect=Exception('oops'))
|
||||
|
||||
# Allow the fake-image-create to continue by removing the pause file
|
||||
os.remove(pause_file)
|
||||
|
||||
# The fake dib process writes out a .done file at the end. We need
|
||||
# this so we do not continue in this test until *after* all files are
|
||||
# written by that dib process.
|
||||
done_file = os.path.join(self._config_images_dir.path,
|
||||
"fake-image-create.done")
|
||||
while not os.path.exists(done_file):
|
||||
time.sleep(.1)
|
||||
|
||||
# There shouldn't be any DIB files even though cleanup thread is
|
||||
# disabled because the builder should clean up after itself.
|
||||
images_dir = bldr._config.imagesdir
|
||||
|
||||
# Wait for builder to remove the leaked files
|
||||
image_files = builder.DibImageFile.from_image_id(
|
||||
images_dir, 'fake-image-0000000001')
|
||||
while image_files:
|
||||
time.sleep(.1)
|
||||
image_files = builder.DibImageFile.from_image_id(
|
||||
images_dir, 'fake-image-0000000001')
|
||||
|
|
Loading…
Reference in New Issue