summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTobias Henkel <tobias.henkel@bmw.de>2018-12-05 10:18:41 +0100
committerTobias Henkel <tobias.henkel@bmw.de>2018-12-05 10:30:54 +0100
commit41c968e3acfb2bd7e728ac7a325feb7bc0fbd7f2 (patch)
tree78845940421e7f7c606b42a77ac354c19d2ebdb8
parent6fe861f42a22eb9e8617ce2faebb76ae6aee4552 (diff)
Make estimatedNodepoolQuotaUsed more resilient
We had the case that we stored znodes without pool or type. At least znodes without type break the quota calculation and can lead to wedged providers. So make that more resilient and log exceptions per node instead of failing the complete calculation. This way we don't wedge in case we have bogus data in zk while still being able to debug what's wrong with certain znodes. Change-Id: I4a33ffbbf3684dc3830913ed8dc7b158f2426602
Notes
Notes (review): Code-Review+2: James E. Blair <corvus@inaugust.com> Code-Review+2: David Shrewsbury <shrewsbury.dave@gmail.com> Workflow+1: David Shrewsbury <shrewsbury.dave@gmail.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Wed, 05 Dec 2018 16:14:50 +0000 Reviewed-on: https://review.openstack.org/622906 Project: openstack-infra/nodepool Branch: refs/heads/master
-rwxr-xr-xnodepool/driver/openstack/provider.py44
-rw-r--r--nodepool/tests/unit/test_launcher.py33
2 files changed, 57 insertions, 20 deletions
diff --git a/nodepool/driver/openstack/provider.py b/nodepool/driver/openstack/provider.py
index a78255b..b9a1462 100755
--- a/nodepool/driver/openstack/provider.py
+++ b/nodepool/driver/openstack/provider.py
@@ -151,26 +151,30 @@ class OpenStackProvider(Provider):
151 151
152 for node in self._zk.nodeIterator(): 152 for node in self._zk.nodeIterator():
153 if node.provider == self.provider.name: 153 if node.provider == self.provider.name:
154 if pool and not node.pool == pool.name: 154 try:
155 continue 155 if pool and not node.pool == pool.name:
156 provider_pool = self.provider.pools.get(node.pool) 156 continue
157 if not provider_pool: 157 provider_pool = self.provider.pools.get(node.pool)
158 self.log.warning( 158 if not provider_pool:
159 "Cannot find provider pool for node %s" % node) 159 self.log.warning(
160 # This node is in a funny state we log it for debugging 160 "Cannot find provider pool for node %s" % node)
161 # but move on and don't account it as we can't properly 161 # This node is in a funny state we log it for debugging
162 # calculate its cost without pool info. 162 # but move on and don't account it as we can't properly
163 continue 163 # calculate its cost without pool info.
164 if node.type[0] not in provider_pool.labels: 164 continue
165 self.log.warning( 165 if node.type[0] not in provider_pool.labels:
166 "Node type is not in provider pool for node %s" % node) 166 self.log.warning("Node type is not in provider pool "
167 # This node is also in a funny state; the config 167 "for node %s" % node)
168 # may have changed under it. It should settle out 168 # This node is also in a funny state; the config
169 # eventually when it's deleted. 169 # may have changed under it. It should settle out
170 continue 170 # eventually when it's deleted.
171 node_resources = self.quotaNeededByNodeType( 171 continue
172 node.type[0], provider_pool) 172 node_resources = self.quotaNeededByNodeType(
173 used_quota.add(node_resources) 173 node.type[0], provider_pool)
174 used_quota.add(node_resources)
175 except Exception:
176 self.log.exception("Couldn't consider invalid node %s "
177 "for quota:" % node)
174 return used_quota 178 return used_quota
175 179
176 def unmanagedQuotaUsed(self): 180 def unmanagedQuotaUsed(self):
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 3422b10..acc0776 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -729,6 +729,39 @@ class TestLauncher(tests.DBTestCase):
729 # retries in config is set to 2, so 2 attempts to create a server 729 # retries in config is set to 2, so 2 attempts to create a server
730 self.assertEqual(0, manager.createServer_fails) 730 self.assertEqual(0, manager.createServer_fails)
731 731
732 def test_node_launch_with_broken_znodes(self):
733 """Test that node launch still works if there are broken znodes"""
734 # Create a znode without type
735 znode = zk.Node()
736 znode.provider = 'fake-provider'
737 znode.pool = 'main'
738 znode.external_id = 'fakeid'
739 znode.state = zk.READY
740
741 # Create znode without pool
742 self.zk.storeNode(znode)
743 znode = zk.Node()
744 znode.provider = 'fake-provider'
745 znode.type = ['fake-label']
746 znode.external_id = 'fakeid'
747 znode.state = zk.READY
748 self.zk.storeNode(znode)
749
750 configfile = self.setup_config('node_launch_retry.yaml')
751 pool = self.useNodepool(configfile, watermark_sleep=1)
752 self.useBuilder(configfile)
753 pool.start()
754 self.wait_for_config(pool)
755 self.waitForImage('fake-provider', 'fake-image')
756
757 req = zk.NodeRequest()
758 req.state = zk.REQUESTED
759 req.node_types.append('fake-label')
760 self.zk.storeNodeRequest(req)
761
762 req = self.waitForNodeRequest(req)
763 self.assertEqual(req.state, zk.FULFILLED)
764
732 def test_node_launch_retries_with_external_id(self): 765 def test_node_launch_retries_with_external_id(self):
733 configfile = self.setup_config('node_launch_retry.yaml') 766 configfile = self.setup_config('node_launch_retry.yaml')
734 pool = self.useNodepool(configfile, watermark_sleep=1) 767 pool = self.useNodepool(configfile, watermark_sleep=1)