Fix for image build leaks

If, during a long DIB image build, we lose the ZooKeeper session,
it's likely that the CleanupWorker thread could have run and removed
the ZK record for the build (its state would be BUILDING and unlocked,
indicating something went wrong). In that scenario, when the DIB
process finishes (possibly writing out DIB files), it will never get
cleaned up since the ZK record would now be gone. If we fail to update
the ZK record at the end of the build, just delete the leaked DIB files
immediately after the build.

Change-Id: I5cb58318efe51b5b0c3413b7a01f02a50215a8b6
This commit is contained in:
David Shrewsbury 2019-03-25 16:37:48 -04:00
parent 8421476a51
commit fa2d4bd17c
4 changed files with 110 additions and 27 deletions

View File

@ -638,20 +638,55 @@ class BuildWorker(BaseWorker):
return
self.log.info("Building image %s" % diskimage.name)
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
data = self._buildImage(bnum, diskimage)
self._zk.storeBuild(diskimage.name, data, bnum)
self._buildWrapper(diskimage)
except exceptions.ZKLockException:
# Lock is already held. Skip it.
pass
def _buildWrapper(self, diskimage):
'''
Wraps logic for disk image building and ZooKeeper recording.
:returns: The updated ImageBuild data structure.
'''
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
try:
data = self._buildImage(bnum, diskimage)
except Exception:
# If something unexpected happens, make sure we clean up any
# build cruft by executing the fallthrough below.
self.log.exception("Image build failure for build %s of image %s:",
bnum, diskimage.name)
data.state = zk.FAILED
# If we lost the session during the long running build, this means
# we've lost our lock and the node was possibly cleaned up during the
# long build time because state was BUILDING and unlocked. And because
# the cleanup likely did not get the in-progress dib build files, we
# need to make sure we get the rest here.
try:
self._zk.storeBuild(diskimage.name, data, bnum)
except Exception:
# If it disappeared, or the save otherwise failed, cleanup the
# leaked build manually.
self.log.exception(
"Unable to update record for build %s of image %s:",
bnum, diskimage.name)
data.id = bnum
CleanupWorker.deleteLocalBuild(
self._config.imagesdir, diskimage.name, data, self.log)
data.state = zk.FAILED
return data
return self._zk.getBuild(diskimage.name, bnum)
def _checkForManualBuildRequest(self):
'''
Query ZooKeeper for any manual image build requests.
@ -689,16 +724,7 @@ class BuildWorker(BaseWorker):
self.log.info(
"Manual build request for image %s" % diskimage.name)
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
bnum = self._zk.storeBuild(diskimage.name, data)
data = self._buildImage(bnum, diskimage)
self._zk.storeBuild(diskimage.name, data, bnum)
data = self._buildWrapper(diskimage)
# Remove request on a successful build
if data.state == zk.READY:

View File

@ -535,9 +535,10 @@ class DBTestCase(BaseTestCase):
return app
def useBuilder(self, configfile, securefile=None, cleanup_interval=.5):
self.useFixture(
builder_fixture = self.useFixture(
BuilderFixture(configfile, cleanup_interval, securefile)
)
return builder_fixture.builder
def setupZK(self):
f = ZookeeperServerFixture()

View File

@ -61,14 +61,23 @@ done
if [ -z "$outfile" ]; then
echo "No output file specified."
exit 1
else
for outtype in ${outtypes[@]} ; do
echo "fake-data" > $outfile.$outtype
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
done
fi
# Tests can pause this script by creating this file before it runs.
# No dib files should be created yet at the pause point.
tmpdir=$(dirname $outfile)
pause_file="$tmpdir/fake-image-create.pause"
while [ -f "$pause_file" ]
do
sleep .1
done
for outtype in ${outtypes[@]} ; do
echo "fake-data" > $outfile.$outtype
echo "10da41d43d4bd6d67db763616c18b72f" > $outfile.$outtype.md5
echo "0033e9d444953d11689b5fa6a6dba32bf901582f62b0825bc35f593190b1f7dc" > $outfile.$outtype.sha256
done
# Emulate manifest creation
mkdir $outfile.d
@ -78,3 +87,6 @@ if [[ "${SHOULD_FAIL}" == 'true' ]]; then
fi
echo "*** fake-image-create: done"
# Tests might need to know when this process is completed
touch $tmpdir/fake-image-create.done

View File

@ -17,6 +17,7 @@ import os
import uuid
import fixtures
import mock
import time
from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider
@ -350,3 +351,46 @@ class TestNodePoolBuilder(tests.DBTestCase):
builder.BUILD_PROCESS_POLL_TIMEOUT = 500
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
def test_session_loss_during_build(self):
configfile = self.setup_config('node.yaml')
# We need to make the image build process pause so we can introduce
# a simulated ZK session loss. The fake dib process will sleep while
# the pause file is present in the images directory supplied to it.
pause_file = os.path.join(self._config_images_dir.path,
"fake-image-create.pause")
open(pause_file, 'w')
# Disable cleanup thread to verify builder cleans up after itself
bldr = self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.BUILDING,))
# The build should now be paused just before writing out any DIB files.
# Mock the next call to storeBuild() which is supposed to be the update
# of the current build in ZooKeeper. This failure simulates losing the
# ZK session and not being able to update the record.
bldr.zk.storeBuild = mock.Mock(side_effect=Exception('oops'))
# Allow the fake-image-create to continue by removing the pause file
os.remove(pause_file)
# The fake dib process writes out a .done file at the end. We need
# this so we do not continue in this test until *after* all files are
# written by that dib process.
done_file = os.path.join(self._config_images_dir.path,
"fake-image-create.done")
while not os.path.exists(done_file):
time.sleep(.1)
# There shouldn't be any DIB files even though cleanup thread is
# disabled because the builder should clean up after itself.
images_dir = bldr._config.imagesdir
# Wait for builder to remove the leaked files
image_files = builder.DibImageFile.from_image_id(
images_dir, 'fake-image-0000000001')
while image_files:
time.sleep(.1)
image_files = builder.DibImageFile.from_image_id(
images_dir, 'fake-image-0000000001')