Revert "Revert "Cleanup down ports""

This reverts commit 7e1b8a7261.

openstacksdk >=0.19.0 fixes the filtering problems that led to all
ports being deleted. However, openstacksdk <0.21.0 has problems with
dogpile.cache, so use 0.21.0 as the minimum.

Change-Id: Id642d074cbb645ced5342dda4a1c89987c91a8fc
Ian Wienand 2018-10-31 10:25:29 +11:00 committed by Tobias Henkel
parent e459ffa0fd
commit 0cf8144e8c
6 changed files with 135 additions and 4 deletions
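
For context on the fix being restored: the port cleanup relies on server-side
filtering when listing ports, and the commit message above notes that broken
openstacksdk releases mishandled that filter. A minimal sketch of the
cloud-layer call pattern involved, assuming a cloud named 'mycloud' is
configured in clouds.yaml (the cloud name is a placeholder, not part of this
change):

import openstack

# Connect to a cloud defined in clouds.yaml; 'mycloud' is a placeholder.
conn = openstack.connect(cloud='mycloud')

# With openstacksdk >= 0.19.0 the status filter is honoured, so only DOWN
# ports come back. With the broken releases the filter could be ignored,
# returning every port, which is how the original change ended up deleting
# all ports.
for port in conn.list_ports(filters={'status': 'DOWN'}):
    print(port.id, port.status, port.device_owner)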

View File

@@ -329,6 +329,12 @@ Nodepool launcher
    * ready
    * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge

View File

@@ -32,6 +32,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")

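A quick sketch of how the fake's port API behaves, assuming
FakeOpenStackCloud can be constructed with its defaults (a standalone
illustration of the diff above, not part of the change):

# The fake seeds two DOWN ports; only the DOWN filter returns them.
cloud = FakeOpenStackCloud()
assert len(cloud.list_ports(filters={'status': 'DOWN'})) == 2
assert cloud.list_ports() == []

# Deleting one port leaves the other behind.
cloud.delete_port('1a')
remaining = cloud.list_ports(filters={'status': 'DOWN'})
assert [p.id for p in remaining] == ['2b']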
View File

@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -428,6 +433,21 @@
                                              **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
@@ -455,7 +475,7 @@
         self.log.debug('Deleting server %s' % server_id)
         self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
@@ -503,6 +523,63 @@
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed between cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
+
         if self.provider.clean_floating_ips:
             self._client.delete_unattached_floating_ips()
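
The deletion logic above is deliberately two-pass: a port is only removed if
it shows up as DOWN in two scans taken at least _port_cleanup_interval_secs
apart, so a port merely passing through DOWN during a normal server lifecycle
is left alone. A condensed standalone sketch of that idea (class and
parameter names here are illustrative, not the driver's API):

import time

class DownPortTracker:
    '''Delete ports only after they stay DOWN across two spaced scans.'''

    def __init__(self, interval_secs=180):
        self.interval = interval_secs
        self.last_scan = None
        self.seen_down = set()

    def scan(self, list_down_ids, delete_port):
        if self.last_scan is None:
            # First pass: record a baseline, delete nothing yet.
            self.seen_down = set(list_down_ids())
            self.last_scan = time.monotonic()
            return 0
        if time.monotonic() - self.last_scan <= self.interval:
            # Not enough time between scans; wait for the next pass.
            return 0
        # Ports DOWN now *and* at the last scan are treated as leaked.
        current = set(list_down_ids())
        removed = 0
        for port_id in current & self.seen_down:
            delete_port(port_id)
            removed += 1
        # Re-list rather than adjusting the set ourselves, as the driver does.
        self.last_scan = time.monotonic()
        self.seen_down = set(list_down_ids())
        return removed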

View File

@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
         while self.zk.client.exists(path):
             time.sleep(.1)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')

View File

@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.
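
For reference, the metric lands in statsd as a plain counter. A sketch using
the statsd Python client that nodepool already depends on (the host, port,
provider name, and count of 2 are placeholders taken from the test above):

import statsd

client = statsd.StatsClient('localhost', 8125)
# Increment the per-provider counter by the number of ports removed.
client.incr('nodepool.provider.fake-provider.downPorts', 2)
# On the wire this is: nodepool.provider.fake-provider.downPorts:2|c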

View File

@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
 extras
 statsd>=3.0
 PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
 diskimage-builder>=2.0.0
 voluptuous
 kazoo