Merge "Revert "Revert "Cleanup down ports"""

This commit is contained in:
Zuul 2019-01-24 16:15:33 +00:00 committed by Gerrit Code Review
commit 78d2476769
6 changed files with 135 additions and 4 deletions

View File

@@ -329,6 +329,12 @@ Nodepool launcher
   * ready
   * used

.. zuul:stat:: nodepool.provider.<provider>.downPorts
   :type: counter

   Number of ports in the DOWN state that have been removed automatically
   in the cleanup resources phase of the OpenStack driver.

.. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
   :type: gauge

View File

@@ -32,6 +32,7 @@ class Dummy(object):
    INSTANCE = 'Instance'
    FLAVOR = 'Flavor'
    LOCATION = 'Server.Location'
    PORT = 'Port'

    def __init__(self, kind, **kw):
        self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
        self._server_list = []
        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
            _get_quota()
        self._down_ports = [
            Dummy(Dummy.PORT, id='1a', status='DOWN',
                  device_owner="compute:nova"),
            Dummy(Dummy.PORT, id='2b', status='DOWN',
                  device_owner=None),
        ]

    def _get(self, name_or_id, instance_list):
        self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
            total_ram_used=8192 * len(self._server_list)
        )

    def list_ports(self, filters=None):
        if filters and filters.get('status') == 'DOWN':
            return self._down_ports
        return []

    def delete_port(self, port_id):
        tmp_ports = []
        for port in self._down_ports:
            if port.id != port_id:
                tmp_ports.append(port)
            else:
                self.log.debug("Deleted port ID: %s", port_id)
        self._down_ports = tmp_ports
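Taken together, the fake's port handling can be exercised directly. A rough usage sketch, assuming FakeOpenStackCloud is constructible with its defaults:

cloud = FakeOpenStackCloud()
down = cloud.list_ports(filters={'status': 'DOWN'})
assert [p.id for p in down] == ['1a', '2b']  # the two ports seeded in __init__
cloud.delete_port('1a')  # drops the port from _down_ports
assert [p.id for p in cloud.list_ports(filters={'status': 'DOWN'})] == ['2b']
assert cloud.list_ports() == []  # any other filter yields an empty list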
class FakeUploadFailCloud(FakeOpenStackCloud):
    log = logging.getLogger("nodepool.FakeUploadFailCloud")

View File

@@ -26,6 +26,7 @@ from nodepool.driver import Provider
from nodepool.driver.utils import QuotaInformation
from nodepool.nodeutils import iterate_timeout
from nodepool.task_manager import TaskManager
from nodepool import stats
from nodepool import version
from nodepool import zk
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
        self._taskmanager = None
        self._current_nodepool_quota = None
        self._zk = None
        self._down_ports = set()
        self._last_port_cleanup = None
        self._port_cleanup_interval_secs = 180
        self._statsd = stats.get_client()

    def start(self, zk_conn):
        if self._use_taskmanager:
@@ -430,6 +435,21 @@
                                          **meta)
        return image.id

    def listPorts(self, status=None):
        '''
        List known ports.

        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
        '''
        if status:
            ports = self._client.list_ports(filters={'status': status})
        else:
            ports = self._client.list_ports()
        return ports

    def deletePort(self, port_id):
        self._client.delete_port(port_id)

    def listImages(self):
        return self._client.list_images()
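The two port wrappers above are thin pass-throughs to the SDK client. A small sketch of how a caller might combine them; the function name is illustrative and the provider argument is assumed to be an already-constructed OpenStackProvider:

def purge_down_ports(provider):
    # Delete every port the cloud currently reports as DOWN,
    # using only the two wrapper methods defined above.
    for port in provider.listPorts(status='DOWN'):
        provider.deletePort(port.id)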
@@ -457,7 +477,7 @@
        self.log.debug('Deleting server %s' % server_id)
        self.deleteServer(server_id)

-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
        '''
        Delete any leaked server instances.
@@ -505,6 +525,63 @@
                node.state = zk.DELETING
                self._zk.storeNode(node)

    def filterComputePorts(self, ports):
        '''
        Return a list of compute ports (or ports with no device owner).

        We are not interested in ports for routers or DHCP.
        '''
        ret = []
        for p in ports:
            if p.device_owner is None or p.device_owner.startswith("compute:"):
                ret.append(p)
        return ret
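For example, compute-owned and ownerless ports pass the filter while a router-owned port is dropped. A self-contained sketch of the same predicate (FakePort and the router device_owner value are illustrative):

class FakePort:
    def __init__(self, id, device_owner):
        self.id = id
        self.device_owner = device_owner

ports = [
    FakePort('1a', 'compute:nova'),              # kept: compute-owned
    FakePort('2b', None),                        # kept: no device owner
    FakePort('3c', 'network:router_interface'),  # dropped: router port
]
kept = [p for p in ports
        if p.device_owner is None or p.device_owner.startswith('compute:')]
assert [p.id for p in kept] == ['1a', '2b']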
    def cleanupLeakedPorts(self):
        if not self._last_port_cleanup:
            self._last_port_cleanup = time.monotonic()
            ports = self.listPorts(status='DOWN')
            ports = self.filterComputePorts(ports)
            self._down_ports = set([p.id for p in ports])
            return

        # Return if not enough time has passed since the last cleanup.
        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
        if last_check_in_secs <= self._port_cleanup_interval_secs:
            return

        ports = self.listPorts(status='DOWN')
        ports = self.filterComputePorts(ports)
        current_set = set([p.id for p in ports])
        remove_set = current_set & self._down_ports

        removed_count = 0
        for port_id in remove_set:
            try:
                self.deletePort(port_id)
            except Exception:
                self.log.exception("Exception deleting port %s in %s:",
                                   port_id, self.provider.name)
            else:
                removed_count += 1
                self.log.debug("Removed DOWN port %s in %s",
                               port_id, self.provider.name)

        if self._statsd and removed_count:
            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
            self._statsd.incr(key, removed_count)

        self._last_port_cleanup = time.monotonic()

        # Rely on OpenStack to tell us the down ports rather than doing our
        # own set adjustment.
        ports = self.listPorts(status='DOWN')
        ports = self.filterComputePorts(ports)
        self._down_ports = set([p.id for p in ports])
    def cleanupLeakedResources(self):
        self.cleanupLeakedInstances()
        self.cleanupLeakedPorts()
        if self.provider.clean_floating_ips:
            self._client.delete_unattached_floating_ips()
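The net effect of cleanupLeakedPorts() is a two-pass sweep: a port is deleted only if it was DOWN on one scan and is still DOWN on a later scan at least _port_cleanup_interval_secs (180 seconds, hence the release note's three minutes) afterwards. A condensed, standalone restatement of that logic; the function and dict names are illustrative:

import time

def sweep_down_ports(list_down_ids, delete_port, state, interval_secs=180):
    # 'state' is a dict standing in for the provider instance's
    # _down_ports / _last_port_cleanup attributes.
    now = time.monotonic()
    if state.get('last_cleanup') is None:
        # First pass: just remember what is DOWN right now.
        state['down_ports'] = set(list_down_ids())
        state['last_cleanup'] = now
        return
    if now - state['last_cleanup'] <= interval_secs:
        return
    # Later pass: delete only ports that were DOWN on both scans.
    for port_id in set(list_down_ids()) & state['down_ports']:
        delete_port(port_id)
    state['last_cleanup'] = now
    # Re-scan so the next pass starts from the cloud's current view.
    state['down_ports'] = set(list_down_ids())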

View File

@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
        while self.zk.client.exists(path):
            time.sleep(.1)

    def test_leaked_port_cleanup(self):
        configfile = self.setup_config('node.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.cleanup_interval = 1
        pool.start()
        self.waitForNodes('fake-label')

        manager = pool.getProviderManager('fake-provider')
        down_ports = manager.listPorts(status='DOWN')
        self.assertEqual(2, len(down_ports))
        self.log.debug("Down ports: %s", down_ports)

        # Shorten the port cleanup interval so the cleanup runs quickly.
        manager._port_cleanup_interval_secs = 2
        while manager.listPorts(status='DOWN'):
            time.sleep(1)

        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
                                value='2', kind='c')

View File

@@ -0,0 +1,7 @@
---
features:
  - |
    Added a new routine to the cleanup resources phase of the OpenStack
    driver that removes any ports reported to be in the DOWN state. A port
    must be seen as DOWN for at least three minutes before it is removed.
    The number of removed ports is reported to statsd.

View File

@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
extras
statsd>=3.0
PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
diskimage-builder>=2.0.0
voluptuous
kazoo