summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames E. Blair <jeblair@redhat.com>2018-12-04 08:14:43 -0800
committerJames E. Blair <jeblair@redhat.com>2018-12-04 09:49:40 -0800
commitb3053779a676b2deb23eaf2df6832d3491932bf8 (patch)
tree490e0037d37a704ecac5502d3109ddc3e237007a
parent50302e7255b27ddc6a356ff8c92111ecf569aab0 (diff)
Fix race when deleting Node znodes
There is a race where: Thread A Thread B locks the node deletes the instance deletes the lock locks the node deletes the node data loads empty data from the node Thread A only deletes the node data because during it's recursive delete, a new child node (the lock from B) has appeared. Thread B proceeds with invalid data and errors out. We can detect this condition because the node has no data. It may be theoretically possible to lock the node and load the data before the node data are deleted as well, so to protect against this case, we set a new node state, "deleted" before we start deleting anything. If thread B encounters either of those two conditions (no data, or a "deleted" state), we know we've hit this race and can safely attempt to recursively delete the node again. Change-Id: Iea5558f9eb471cf1096120b06c098f8f41ab59d9
Notes
Notes (review): Code-Review+2: Tobias Henkel <tobias.henkel@bmw.de> Code-Review+2: David Shrewsbury <shrewsbury.dave@gmail.com> Workflow+1: David Shrewsbury <shrewsbury.dave@gmail.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Tue, 04 Dec 2018 19:20:05 +0000 Reviewed-on: https://review.openstack.org/622403 Project: openstack-infra/nodepool Branch: refs/heads/master
-rwxr-xr-xnodepool/launcher.py21
-rwxr-xr-xnodepool/zk.py12
2 files changed, 32 insertions, 1 deletions
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index 6f2c39c..e881ec2 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -704,6 +704,27 @@ class DeletedNodeWorker(BaseCleanupWorker):
704 except exceptions.ZKLockException: 704 except exceptions.ZKLockException:
705 continue 705 continue
706 706
707 if (node.state == zk.DELETED or
708 node.provider is None):
709 # The node has been deleted out from under us --
710 # we only obtained the lock because in the
711 # recursive delete, the lock is deleted first and
712 # we locked the node between the time of the lock
713 # delete and the node delete. We need to clean up
714 # the mess.
715 try:
716 # This should delete the lock as well
717 zk_conn.deleteNode(node)
718 except Exception:
719 self.log.exception(
720 "Error deleting already deleted znode:")
721 try:
722 zk_conn.unlockNode(node)
723 except Exception:
724 self.log.exception(
725 "Error unlocking already deleted znode:")
726 continue
727
707 # Double check the state now that we have a lock since it 728 # Double check the state now that we have a lock since it
708 # may have changed on us. 729 # may have changed on us.
709 if node.state not in cleanup_states: 730 if node.state not in cleanup_states:
diff --git a/nodepool/zk.py b/nodepool/zk.py
index ca02502..cfdb6df 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -57,6 +57,8 @@ INIT = 'init'
57# Aborted due to a transient error like overquota that should not count as a 57# Aborted due to a transient error like overquota that should not count as a
58# failed launch attempt 58# failed launch attempt
59ABORTED = 'aborted' 59ABORTED = 'aborted'
60# The node has actually been deleted and the Znode should be deleted
61DELETED = 'deleted'
60 62
61 63
62# NOTE(Shrews): Importing this from nodepool.config causes an import error 64# NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -503,7 +505,8 @@ class Node(BaseModel):
503 Class representing a launched node. 505 Class representing a launched node.
504 ''' 506 '''
505 VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED, 507 VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
506 HOLD, DELETING, FAILED, INIT, ABORTED]) 508 HOLD, DELETING, FAILED, INIT, ABORTED,
509 DELETED])
507 510
508 def __init__(self, id=None): 511 def __init__(self, id=None):
509 super(Node, self).__init__(id) 512 super(Node, self).__init__(id)
@@ -1889,6 +1892,13 @@ class ZooKeeper(object):
1889 return 1892 return
1890 1893
1891 path = self._nodePath(node.id) 1894 path = self._nodePath(node.id)
1895
1896 # Set the node state to deleted before we start deleting
1897 # anything so that we can detect a race condition where the
1898 # lock is removed before the node deletion occurs.
1899 node.state = DELETED
1900 self.client.set(path, node.serialize())
1901
1892 try: 1902 try:
1893 self.client.delete(path, recursive=True) 1903 self.client.delete(path, recursive=True)
1894 except kze.NoNodeError: 1904 except kze.NoNodeError: