summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author	Zuul <zuul@review.openstack.org>	2019-01-24 16:15:33 +0000
committer	Gerrit Code Review <review@openstack.org>	2019-01-24 16:15:33 +0000
commit	78d2476769cdbca46449a9e176ff2656da075c67 (patch)
tree	773b3c5c4cbd2aafb915d74d13927d6885f0168b
parent	26c57ee5a90320b4a4651aecbda384ef9c97946c (diff)
parent	0cf8144e8ce403e6c8365f17eeb774421a7bd094 (diff)
Merge "Revert "Revert "Cleanup down ports"""
-rw-r--r--	doc/source/operation.rst	6
-rw-r--r--	nodepool/driver/fake/provider.py	21
-rwxr-xr-x	nodepool/driver/openstack/provider.py	79
-rw-r--r--	nodepool/tests/unit/test_launcher.py	21
-rw-r--r--	releasenotes/notes/port-cleanup-667d476437f03358.yaml	7
-rw-r--r--	requirements.txt	5
6 files changed, 135 insertions, 4 deletions
diff --git a/doc/source/operation.rst b/doc/source/operation.rst
index 80d96cf..470e9f9 100644
--- a/doc/source/operation.rst
+++ b/doc/source/operation.rst
@@ -329,6 +329,12 @@ Nodepool launcher
329 * ready 329 * ready
330 * used 330 * used
331 331
332.. zuul:stat:: nodepool.provider.<provider>.downPorts
333 :type: counter
334
335 Number of ports in the DOWN state that have been removed automatically
336 in the cleanup resources phase of the OpenStack driver.
337
332.. zuul:stat:: nodepool.provider.<provider>.nodes.<state> 338.. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
333 :type: gauge 339 :type: gauge
334 340
diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index 5d67914..55210db 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -32,6 +32,7 @@ class Dummy(object):
32 INSTANCE = 'Instance' 32 INSTANCE = 'Instance'
33 FLAVOR = 'Flavor' 33 FLAVOR = 'Flavor'
34 LOCATION = 'Server.Location' 34 LOCATION = 'Server.Location'
35 PORT = 'Port'
35 36
36 def __init__(self, kind, **kw): 37 def __init__(self, kind, **kw):
37 self.__kind = kind 38 self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
105 self._server_list = [] 106 self._server_list = []
106 self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\ 107 self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
107 _get_quota() 108 _get_quota()
109 self._down_ports = [
110 Dummy(Dummy.PORT, id='1a', status='DOWN',
111 device_owner="compute:nova"),
112 Dummy(Dummy.PORT, id='2b', status='DOWN',
113 device_owner=None),
114 ]
108 115
109 def _get(self, name_or_id, instance_list): 116 def _get(self, name_or_id, instance_list):
110 self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list))) 117 self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
286 total_ram_used=8192 * len(self._server_list) 293 total_ram_used=8192 * len(self._server_list)
287 ) 294 )
288 295
296 def list_ports(self, filters=None):
297 if filters and filters.get('status') == 'DOWN':
298 return self._down_ports
299 return []
300
301 def delete_port(self, port_id):
302 tmp_ports = []
303 for port in self._down_ports:
304 if port.id != port_id:
305 tmp_ports.append(port)
306 else:
307 self.log.debug("Deleted port ID: %s", port_id)
308 self._down_ports = tmp_ports
309
289 310
290class FakeUploadFailCloud(FakeOpenStackCloud): 311class FakeUploadFailCloud(FakeOpenStackCloud):
291 log = logging.getLogger("nodepool.FakeUploadFailCloud") 312 log = logging.getLogger("nodepool.FakeUploadFailCloud")
diff --git a/nodepool/driver/openstack/provider.py b/nodepool/driver/openstack/provider.py
index 3c07c13..df7f670 100755
--- a/nodepool/driver/openstack/provider.py
+++ b/nodepool/driver/openstack/provider.py
@@ -26,6 +26,7 @@ from nodepool.driver import Provider
26from nodepool.driver.utils import QuotaInformation 26from nodepool.driver.utils import QuotaInformation
27from nodepool.nodeutils import iterate_timeout 27from nodepool.nodeutils import iterate_timeout
28from nodepool.task_manager import TaskManager 28from nodepool.task_manager import TaskManager
29from nodepool import stats
29from nodepool import version 30from nodepool import version
30from nodepool import zk 31from nodepool import zk
31 32
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
50 self._taskmanager = None 51 self._taskmanager = None
51 self._current_nodepool_quota = None 52 self._current_nodepool_quota = None
52 self._zk = None 53 self._zk = None
54 self._down_ports = set()
55 self._last_port_cleanup = None
56 self._port_cleanup_interval_secs = 180
57 self._statsd = stats.get_client()
53 58
54 def start(self, zk_conn): 59 def start(self, zk_conn):
55 if self._use_taskmanager: 60 if self._use_taskmanager:
@@ -430,6 +435,21 @@ class OpenStackProvider(Provider):
430 **meta) 435 **meta)
431 return image.id 436 return image.id
432 437
438 def listPorts(self, status=None):
439 '''
440 List known ports.
441
442 :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
443 '''
444 if status:
445 ports = self._client.list_ports(filters={'status': status})
446 else:
447 ports = self._client.list_ports()
448 return ports
449
450 def deletePort(self, port_id):
451 self._client.delete_port(port_id)
452
433 def listImages(self): 453 def listImages(self):
434 return self._client.list_images() 454 return self._client.list_images()
435 455
@@ -457,7 +477,7 @@ class OpenStackProvider(Provider):
457 self.log.debug('Deleting server %s' % server_id) 477 self.log.debug('Deleting server %s' % server_id)
458 self.deleteServer(server_id) 478 self.deleteServer(server_id)
459 479
460 def cleanupLeakedResources(self): 480 def cleanupLeakedInstances(self):
461 ''' 481 '''
462 Delete any leaked server instances. 482 Delete any leaked server instances.
463 483
@@ -505,6 +525,63 @@ class OpenStackProvider(Provider):
505 node.state = zk.DELETING 525 node.state = zk.DELETING
506 self._zk.storeNode(node) 526 self._zk.storeNode(node)
507 527
528 def filterComputePorts(self, ports):
529 '''
530 Return a list of compute ports (or no device owner).
531
532 We are not interested in ports for routers or DHCP.
533 '''
534 ret = []
535 for p in ports:
536 if p.device_owner is None or p.device_owner.startswith("compute:"):
537 ret.append(p)
538 return ret
539
540 def cleanupLeakedPorts(self):
541 if not self._last_port_cleanup:
542 self._last_port_cleanup = time.monotonic()
543 ports = self.listPorts(status='DOWN')
544 ports = self.filterComputePorts(ports)
545 self._down_ports = set([p.id for p in ports])
546 return
547
548 # Return if not enough time has passed between cleanup
549 last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
550 if last_check_in_secs <= self._port_cleanup_interval_secs:
551 return
552
553 ports = self.listPorts(status='DOWN')
554 ports = self.filterComputePorts(ports)
555 current_set = set([p.id for p in ports])
556 remove_set = current_set & self._down_ports
557
558 removed_count = 0
559 for port_id in remove_set:
560 try:
561 self.deletePort(port_id)
562 except Exception:
563 self.log.exception("Exception deleting port %s in %s:",
564 port_id, self.provider.name)
565 else:
566 removed_count += 1
567 self.log.debug("Removed DOWN port %s in %s",
568 port_id, self.provider.name)
569
570 if self._statsd and removed_count:
571 key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
572 self._statsd.incr(key, removed_count)
573
574 self._last_port_cleanup = time.monotonic()
575
576 # Rely on OpenStack to tell us the down ports rather than doing our
577 # own set adjustment.
578 ports = self.listPorts(status='DOWN')
579 ports = self.filterComputePorts(ports)
580 self._down_ports = set([p.id for p in ports])
581
582 def cleanupLeakedResources(self):
583 self.cleanupLeakedInstances()
584 self.cleanupLeakedPorts()
508 if self.provider.clean_floating_ips: 585 if self.provider.clean_floating_ips:
509 self._client.delete_unattached_floating_ips() 586 self._client.delete_unattached_floating_ips()
510 587
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 2efea9d..753cd72 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
1935 1935
1936 while self.zk.client.exists(path): 1936 while self.zk.client.exists(path):
1937 time.sleep(.1) 1937 time.sleep(.1)
1938
1939 def test_leaked_port_cleanup(self):
1940 configfile = self.setup_config('node.yaml')
1941 self.useBuilder(configfile)
1942 pool = self.useNodepool(configfile, watermark_sleep=1)
1943 pool.cleanup_interval = 1
1944 pool.start()
1945 self.waitForNodes('fake-label')
1946
1947 manager = pool.getProviderManager('fake-provider')
1948 down_ports = manager.listPorts(status='DOWN')
1949 self.assertEqual(2, len(down_ports))
1950 self.log.debug("Down ports: %s", down_ports)
1951
1952 # Change the port cleanup interval to happen quicker
1953 manager._port_cleanup_interval_secs = 2
1954 while manager.listPorts(status='DOWN'):
1955 time.sleep(1)
1956
1957 self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
1958 value='2', kind='c')
diff --git a/releasenotes/notes/port-cleanup-667d476437f03358.yaml b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
new file mode 100644
index 0000000..e89e375
--- /dev/null
+++ b/releasenotes/notes/port-cleanup-667d476437f03358.yaml
@@ -0,0 +1,7 @@
1---
2features:
3 - |
4 Added a new routine to the OpenStack driver cleanup resources phase that
5 will remove any ports reported to be in the DOWN state. Ports will have
6 to be seen as DOWN for at least three minutes before they will be removed.
7 The number of ports removed will be reported to statsd.
diff --git a/requirements.txt b/requirements.txt
index e65ee3c..f310349 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
6extras 6extras
7statsd>=3.0 7statsd>=3.0
8PrettyTable>=0.6,<0.8 8PrettyTable>=0.6,<0.8
9# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed 9# openstacksdk before 0.21.0 has issues with dogpile.cache
10# once keystoneauth1 has made a release 10openstacksdk>=0.21.0
11openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
12diskimage-builder>=2.0.0 11diskimage-builder>=2.0.0
13voluptuous 12voluptuous
14kazoo 13kazoo