summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2018-12-04 19:20:05 +0000
committerGerrit Code Review <review@openstack.org>2018-12-04 19:20:05 +0000
commit1b5d416f36dc027474dcc3eccfd5665e30884cc7 (patch)
tree8095e2c72d748c64fcde52a94849ba10da582e73
parentad541a3209cc1057f78ef8f31d321061d5532e62 (diff)
parentb3053779a676b2deb23eaf2df6832d3491932bf8 (diff)
Merge "Fix race when deleting Node znodes"
-rwxr-xr-xnodepool/launcher.py21
-rwxr-xr-xnodepool/zk.py12
2 files changed, 32 insertions, 1 deletions
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index 6f2c39c..e881ec2 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -704,6 +704,27 @@ class DeletedNodeWorker(BaseCleanupWorker):
704 except exceptions.ZKLockException: 704 except exceptions.ZKLockException:
705 continue 705 continue
706 706
707 if (node.state == zk.DELETED or
708 node.provider is None):
709 # The node has been deleted out from under us --
710 # we only obtained the lock because in the
711 # recursive delete, the lock is deleted first and
712 # we locked the node between the time of the lock
713 # delete and the node delete. We need to clean up
714 # the mess.
715 try:
716 # This should delete the lock as well
717 zk_conn.deleteNode(node)
718 except Exception:
719 self.log.exception(
720 "Error deleting already deleted znode:")
721 try:
722 zk_conn.unlockNode(node)
723 except Exception:
724 self.log.exception(
725 "Error unlocking already deleted znode:")
726 continue
727
707 # Double check the state now that we have a lock since it 728 # Double check the state now that we have a lock since it
708 # may have changed on us. 729 # may have changed on us.
709 if node.state not in cleanup_states: 730 if node.state not in cleanup_states:
diff --git a/nodepool/zk.py b/nodepool/zk.py
index ca02502..cfdb6df 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -57,6 +57,8 @@ INIT = 'init'
57# Aborted due to a transient error like overquota that should not count as a 57# Aborted due to a transient error like overquota that should not count as a
58# failed launch attempt 58# failed launch attempt
59ABORTED = 'aborted' 59ABORTED = 'aborted'
60# The node has actually been deleted and the Znode should be deleted
61DELETED = 'deleted'
60 62
61 63
62# NOTE(Shrews): Importing this from nodepool.config causes an import error 64# NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -503,7 +505,8 @@ class Node(BaseModel):
503 Class representing a launched node. 505 Class representing a launched node.
504 ''' 506 '''
505 VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED, 507 VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
506 HOLD, DELETING, FAILED, INIT, ABORTED]) 508 HOLD, DELETING, FAILED, INIT, ABORTED,
509 DELETED])
507 510
508 def __init__(self, id=None): 511 def __init__(self, id=None):
509 super(Node, self).__init__(id) 512 super(Node, self).__init__(id)
@@ -1889,6 +1892,13 @@ class ZooKeeper(object):
1889 return 1892 return
1890 1893
1891 path = self._nodePath(node.id) 1894 path = self._nodePath(node.id)
1895
1896 # Set the node state to deleted before we start deleting
1897 # anything so that we can detect a race condition where the
1898 # lock is removed before the node deletion occurs.
1899 node.state = DELETED
1900 self.client.set(path, node.serialize())
1901
1892 try: 1902 try:
1893 self.client.delete(path, recursive=True) 1903 self.client.delete(path, recursive=True)
1894 except kze.NoNodeError: 1904 except kze.NoNodeError: