Set type for errored instances

When a server creation fails but the server already has an external id,
we create a new znode to offload the deletion of that node. This znode
currently lacks the node type, which triggers an exception during node
launch [1]. This wedges the provider until the node deleter kicks in and
successfully deletes that node. Fix this by storing the node type in
this znode.

[1] Exception
Traceback (most recent call last):
  File "nodepool/driver/__init__.py", line 639, in run
    self._runHandler()
  File "nodepool/driver/__init__.py", line 563, in _runHandler
    self._waitForNodeSet()
  File "nodepool/driver/__init__.py", line 463, in _waitForNodeSet
    if not self.hasRemainingQuota(ntype):
  File "nodepool/driver/openstack/handler.py", line 314, in hasRemainingQuota
    self.manager.estimatedNodepoolQuotaUsed())
  File "nodepool/driver/openstack/provider.py", line 164, in estimatedNodepoolQuotaUsed
    if node.type[0] not in provider_pool.labels:
IndexError: list index out of range

Change-Id: I67b269069dddb8349959802d7b1ee049a826d0c5
Co-authored-by: Tobias Henkel <tobias.henkel@bmw.de>
This commit is contained in:
Tristan Cacqueray 2018-12-04 08:44:28 +00:00 committed by Tobias Henkel
parent 1b5d416f36
commit 6fe861f42a
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
3 changed files with 42 additions and 0 deletions

View File

@ -24,6 +24,7 @@ import openstack.exceptions
from nodepool import exceptions
from nodepool.driver.openstack.provider import OpenStackProvider
from nodepool.driver.fake.handler import FakeNodeRequestHandler
from openstack.cloud.exc import OpenStackCloudCreateException
class Dummy(object):
@ -340,6 +341,7 @@ class FakeProvider(OpenStackProvider):
def __init__(self, provider, use_taskmanager):
self.createServer_fails = 0
self.createServer_fails_with_external_id = 0
self.__client = FakeProvider.fake_cloud()
super(FakeProvider, self).__init__(provider, use_taskmanager)
@ -350,6 +352,9 @@ class FakeProvider(OpenStackProvider):
while self.createServer_fails:
self.createServer_fails -= 1
raise Exception("Expected createServer exception")
while self.createServer_fails_with_external_id:
self.createServer_fails_with_external_id -= 1
raise OpenStackCloudCreateException('server', 'fakeid')
return super(FakeProvider, self).createServer(*args, **kwargs)
def getRequestHandler(self, poolworker, request):

View File

@ -254,6 +254,7 @@ class OpenStackNodeLauncher(NodeLauncher):
deleting_node = zk.Node()
deleting_node.provider = self.node.provider
deleting_node.pool = self.node.pool
deleting_node.type = self.node.type
deleting_node.external_id = self.node.external_id
deleting_node.state = zk.DELETING
self.zk.storeNode(deleting_node)

View File

@ -729,6 +729,42 @@ class TestLauncher(tests.DBTestCase):
# retries in config is set to 2, so 2 attempts to create a server
self.assertEqual(0, manager.createServer_fails)
def test_node_launch_retries_with_external_id(self):
    """Check that failed launches with an external id do not wedge things.

    The fake provider manager is told to fail the next two server
    creations via ``createServer_fails_with_external_id``. The first node
    request is expected to end up FAILED; a second request issued
    afterwards, while the delete thread is stopped, is expected to be
    FULFILLED.
    """

    def submit_request_and_wait():
        # Store a fresh request for a single 'fake-label' node and block
        # until the test helper returns its updated state.
        request = zk.NodeRequest()
        request.state = zk.REQUESTED
        request.node_types.append('fake-label')
        self.zk.storeNodeRequest(request)
        return self.waitForNodeRequest(request)

    config_path = self.setup_config('node_launch_retry.yaml')
    nodepool = self.useNodepool(config_path, watermark_sleep=1)
    self.useBuilder(config_path)
    nodepool.start()
    self.wait_for_config(nodepool)

    provider_manager = nodepool.getProviderManager('fake-provider')
    provider_manager.createServer_fails_with_external_id = 2
    self.waitForImage('fake-provider', 'fake-image')

    # Stop the DeletedNodeWorker so that the fake znode used to delete the
    # failed servers is still around when the second node is requested.
    nodepool._delete_thread.stop()
    time.sleep(1)

    failed_request = submit_request_and_wait()
    self.assertEqual(failed_request.state, zk.FAILED)

    # retries in config is set to 2, so both injected failures must have
    # been consumed by the 2 attempts to create a server.
    self.assertEqual(0, provider_manager.createServer_fails_with_external_id)

    # Request another node to verify that nothing is wedged.
    followup_request = submit_request_and_wait()
    self.assertEqual(followup_request.state, zk.FULFILLED)
def test_node_delete_failure(self):
def fail_delete(self, name):
raise RuntimeError('Fake Error')