author     Ian Wienand <iwienand@redhat.com>      2018-10-31 10:25:29 +1100
committer  Tobias Henkel <tobias.henkel@bmw.de>   2019-01-18 15:03:55 +0100
commit     0cf8144e8ce403e6c8365f17eeb774421a7bd094 (patch)
tree       9baede72f2734687bac0198d28ea05032835c523
parent     e459ffa0fd74d26cd3f2cde3f226da71d6796b3b (diff)
Revert "Revert "Cleanup down ports""
This reverts commit 7e1b8a7261d31208d853ecfa255da648ba54b656.

openstacksdk >=0.19.0 fixes the filtering problems that led to all ports
being deleted. However, openstacksdk <0.21.0 has problems with
dogpile.cache, so use 0.21.0 as the minimum.

Change-Id: Id642d074cbb645ced5342dda4a1c89987c91a8fc
Notes (review):
    Code-Review+2: David Shrewsbury <shrewsbury.dave@gmail.com>
    Code-Review+2: Tobias Henkel <tobias.henkel@bmw.de>
    Workflow+1: James E. Blair <corvus@inaugust.com>
    Verified+2: Zuul
    Submitted-by: Zuul
    Submitted-at: Thu, 24 Jan 2019 16:15:33 +0000
    Reviewed-on: https://review.openstack.org/614370
    Project: openstack-infra/nodepool
    Branch: refs/heads/master
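The filtering problem the message refers to sits in the port-listing call
this change relies on. A minimal sketch, assuming an authenticated
openstacksdk Connection named `conn` (the cloud name here is a placeholder):

    import openstack

    conn = openstack.connect(cloud='mycloud')  # 'mycloud' is illustrative

    # With openstacksdk >= 0.19.0 the filters are honored and only DOWN
    # ports come back; on the broken versions every port was returned,
    # which is why the original cleanup deleted ports it should not have.
    down_ports = conn.list_ports(filters={'status': 'DOWN'})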
-rw-r--r--  doc/source/operation.rst                                 6
-rw-r--r--  nodepool/driver/fake/provider.py                        21
-rwxr-xr-x  nodepool/driver/openstack/provider.py                   79
-rw-r--r--  nodepool/tests/unit/test_launcher.py                    21
-rw-r--r--  releasenotes/notes/port-cleanup-667d476437f03358.yaml    7
-rw-r--r--  requirements.txt                                         5
6 files changed, 135 insertions, 4 deletions
diff --git a/doc/source/operation.rst b/doc/source/operation.rst
index 80d96cf..470e9f9 100644
--- a/doc/source/operation.rst
+++ b/doc/source/operation.rst
@@ -329,6 +329,12 @@ Nodepool launcher
    * ready
    * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge
 
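For reference, a statsd counter of this kind travels on the wire in the
standard `<key>:<value>|c` form; for the fake provider exercised in the test
further down, a removal pass would emit roughly (illustrative):

    nodepool.provider.fake-provider.downPorts:2|c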
diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index 5d67914..55210db 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -32,6 +32,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")
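As a quick sanity sketch of the fake cloud's new behaviour (this session is
hypothetical and assumes the fake cloud can be constructed with defaults; it
is not part of the change):

    cloud = FakeOpenStackCloud()
    ports = cloud.list_ports(filters={'status': 'DOWN'})
    assert [p.id for p in ports] == ['1a', '2b']
    cloud.delete_port('1a')
    assert [p.id for p in cloud.list_ports(filters={'status': 'DOWN'})] == ['2b']
    assert cloud.list_ports() == []   # no DOWN filter -> nothing reported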
diff --git a/nodepool/driver/openstack/provider.py b/nodepool/driver/openstack/provider.py
index b9a1462..f73b855 100755
--- a/nodepool/driver/openstack/provider.py
+++ b/nodepool/driver/openstack/provider.py
@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
 
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -428,6 +433,21 @@ class OpenStackProvider(Provider):
                                       **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
 
@@ -455,7 +475,7 @@ class OpenStackProvider(Provider):
         self.log.debug('Deleting server %s' % server_id)
         self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
 
@@ -503,6 +523,63 @@ class OpenStackProvider(Provider):
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed between cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
         if self.provider.clean_floating_ips:
             self._client.delete_unattached_floating_ips()
 
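The cleanup above deliberately needs two passes: a port is deleted only if it
was DOWN on the previous scan and is still DOWN at least
_port_cleanup_interval_secs (180s) later, so transient DOWN states survive. A
distilled sketch of that intersection logic outside the driver (the class and
names below are illustrative only, not nodepool code):

    import time

    class DownPortReaper:
        """Illustrative two-pass reaper; not the driver's actual class."""

        def __init__(self, interval=180):
            self.interval = interval
            self.seen_down = set()   # port IDs DOWN on the previous pass
            self.last_pass = None

        def run(self, list_down_ids, delete):
            now = time.monotonic()
            if self.last_pass is None:
                # First pass only records what is currently DOWN.
                self.seen_down = set(list_down_ids())
                self.last_pass = now
                return
            if now - self.last_pass <= self.interval:
                return
            # DOWN on both passes => treated as leaked.
            for port_id in set(list_down_ids()) & self.seen_down:
                delete(port_id)
            self.last_pass = now
            # Re-list after deleting, as the driver does, so the recorded
            # set reflects the cloud's current view.
            self.seen_down = set(list_down_ids())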
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 2efea9d..753cd72 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
 
         while self.zk.client.exists(path):
             time.sleep(.1)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')
diff --git a/releasenotes/notes/port-cleanup-667d476437f03358.yaml b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
new file mode 100644
index 0000000..e89e375
--- /dev/null
+++ b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.
diff --git a/requirements.txt b/requirements.txt
index e65ee3c..f310349 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
 extras
 statsd>=3.0
 PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
 diskimage-builder>=2.0.0
 voluptuous
 kazoo