summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2019-02-28 17:59:46 +0000
committerGerrit Code Review <review@openstack.org>2019-02-28 17:59:46 +0000
commitb12d0d61645f210ebed9a3930295c191d4c9621d (patch)
treee66efb7e180330eaa2a5450c7a085735d5765f7f
parentec5af360f2b48a4fcdb0e70555a75796857efad9 (diff)
parentcd9264a5b884506f39207dbdb5cb0c02f9a040d0 (diff)
Merge "Fix rare semaphore leak during reconfiguration"
-rw-r--r--tests/fixtures/config/semaphore/git/common-config/zuul-remove-job.yaml110
-rw-r--r--tests/unit/test_scheduler.py90
-rw-r--r--zuul/manager/__init__.py2
-rw-r--r--zuul/scheduler.py8
4 files changed, 207 insertions, 3 deletions
diff --git a/tests/fixtures/config/semaphore/git/common-config/zuul-remove-job.yaml b/tests/fixtures/config/semaphore/git/common-config/zuul-remove-job.yaml
new file mode 100644
index 0000000..6630238
--- /dev/null
+++ b/tests/fixtures/config/semaphore/git/common-config/zuul-remove-job.yaml
@@ -0,0 +1,110 @@
1- pipeline:
2 name: check
3 manager: independent
4 trigger:
5 gerrit:
6 - event: patchset-created
7 success:
8 gerrit:
9 Verified: 1
10 failure:
11 gerrit:
12 Verified: -1
13
14- semaphore:
15 name: test-semaphore-two
16 max: 2
17
18- job:
19 name: base
20 parent: null
21 nodeset:
22 nodes:
23 - name: controller
24 label: label1
25
26- job:
27 name: project-test1
28 run: playbooks/project-test1.yaml
29
30- job:
31 name: semaphore-one-test1
32 semaphore: test-semaphore
33 run: playbooks/semaphore-one-test1.yaml
34
35- job:
36 name: semaphore-one-test2
37 semaphore: test-semaphore
38 run: playbooks/semaphore-one-test2.yaml
39
40- job:
41 name: semaphore-two-test1
42 semaphore: test-semaphore-two
43 run: playbooks/semaphore-two-test1.yaml
44
45- job:
46 name: semaphore-two-test2
47 semaphore: test-semaphore-two
48 run: playbooks/semaphore-two-test2.yaml
49
50- job:
51 name: semaphore-one-test3
52 semaphore: test-semaphore
53 run: playbooks/semaphore-two-test1.yaml
54 nodeset:
55 nodes:
56 - name: controller
57 label: label1
58
59- job:
60 name: semaphore-one-test1-resources-first
61 semaphore:
62 name: test-semaphore
63 resources-first: True
64 run: playbooks/semaphore-one-test1.yaml
65
66- job:
67 name: semaphore-one-test2-resources-first
68 semaphore:
69 name: test-semaphore
70 resources-first: True
71 run: playbooks/semaphore-one-test1.yaml
72
73- project:
74 name: org/project
75 check:
76 jobs:
77 - project-test1
78 # This is the difference to zuul.yaml that is used in
79 # test_semaphore_reconfigure_job_removal:
80 # - semaphore-one-test1
81 # - semaphore-one-test2
82
83
84- project:
85 name: org/project1
86 check:
87 jobs:
88 - project-test1
89 - semaphore-two-test1
90 - semaphore-two-test2
91
92- project:
93 name: org/project2
94 check:
95 jobs:
96 - semaphore-one-test3
97
98- project:
99 name: org/project3
100 check:
101 jobs:
102 - project-test1
103 - semaphore-one-test1-resources-first
104 - semaphore-one-test2-resources-first
105
106- project:
107 name: org/project4
108 check:
109 jobs:
110 - semaphore-one-test1-resources-first
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 690971e..3611cde 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -6584,6 +6584,96 @@ class TestSemaphore(ZuulTestCase):
6584 self.assertFalse('test-semaphore' in 6584 self.assertFalse('test-semaphore' in
6585 tenant.semaphore_handler.semaphores) 6585 tenant.semaphore_handler.semaphores)
6586 6586
6587 def test_semaphore_reconfigure_job_removal(self):
6588 "Test job removal during reconfiguration with semaphores"
6589 self.executor_server.hold_jobs_in_build = True
6590 tenant = self.sched.abide.tenants.get('tenant-one')
6591
6592 A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
6593 self.assertFalse('test-semaphore' in
6594 tenant.semaphore_handler.semaphores)
6595
6596 self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6597 self.waitUntilSettled()
6598
6599 self.assertTrue('test-semaphore' in
6600 tenant.semaphore_handler.semaphores)
6601
6602 self.commitConfigUpdate(
6603 'common-config',
6604 'config/semaphore/git/common-config/zuul-remove-job.yaml')
6605 self.sched.reconfigure(self.config)
6606 self.waitUntilSettled()
6607
6608 # Release job project-test1 which should be the only job left
6609 self.executor_server.release('project-test1')
6610 self.waitUntilSettled()
6611
6612 # The check pipeline should be empty
6613 tenant = self.sched.abide.tenants.get('tenant-one')
6614 check_pipeline = tenant.layout.pipelines['check']
6615 items = check_pipeline.getAllItems()
6616 self.assertEqual(len(items), 0)
6617
6618 # The semaphore should be released
6619 self.assertFalse('test-semaphore' in
6620 tenant.semaphore_handler.semaphores)
6621
6622 self.executor_server.hold_jobs_in_build = False
6623 self.executor_server.release()
6624 self.waitUntilSettled()
6625
6626 def test_semaphore_reconfigure_job_removal_pending_node_request(self):
6627 """
6628 Test job removal during reconfiguration with semaphores and pending
6629 node request.
6630 """
6631 self.executor_server.hold_jobs_in_build = True
6632
6633 # Pause nodepool so we can block the job in node request state during
6634 # reconfiguration.
6635 self.fake_nodepool.pause()
6636
6637 tenant = self.sched.abide.tenants.get('tenant-one')
6638
6639 A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
6640 self.assertFalse('test-semaphore' in
6641 tenant.semaphore_handler.semaphores)
6642
6643 self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6644 self.waitUntilSettled()
6645
6646 self.assertTrue('test-semaphore' in
6647 tenant.semaphore_handler.semaphores)
6648
6649 self.commitConfigUpdate(
6650 'common-config',
6651 'config/semaphore/git/common-config/zuul-remove-job.yaml')
6652 self.sched.reconfigure(self.config)
6653 self.waitUntilSettled()
6654
6655 # Now we can unpause nodepool
6656 self.fake_nodepool.unpause()
6657 self.waitUntilSettled()
6658
6659 # Release job project-test1 which should be the only job left
6660 self.executor_server.release('project-test1')
6661 self.waitUntilSettled()
6662
6663 # The check pipeline should be empty
6664 tenant = self.sched.abide.tenants.get('tenant-one')
6665 check_pipeline = tenant.layout.pipelines['check']
6666 items = check_pipeline.getAllItems()
6667 self.assertEqual(len(items), 0)
6668
6669 # The semaphore should be released
6670 self.assertFalse('test-semaphore' in
6671 tenant.semaphore_handler.semaphores)
6672
6673 self.executor_server.hold_jobs_in_build = False
6674 self.executor_server.release()
6675 self.waitUntilSettled()
6676
6587 6677
6588class TestSemaphoreMultiTenant(ZuulTestCase): 6678class TestSemaphoreMultiTenant(ZuulTestCase):
6589 tenant_config_file = 'config/multi-tenant-semaphore/main.yaml' 6679 tenant_config_file = 'config/multi-tenant-semaphore/main.yaml'
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 6b5abc1..d549691 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -836,7 +836,7 @@ class PipelineManager(object):
836 self.log.debug("Item %s status is now:\n %s" % 836 self.log.debug("Item %s status is now:\n %s" %
837 (item, item.formatStatus())) 837 (item, item.formatStatus()))
838 838
839 if build.retry: 839 if build.retry and build.build_set.getJobNodeSet(build.job.name):
840 build.build_set.removeJobNodeSet(build.job.name) 840 build.build_set.removeJobNodeSet(build.job.name)
841 841
842 self._resumeBuilds(build.build_set) 842 self._resumeBuilds(build.build_set)
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 0a4bcee..6f82568 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -830,6 +830,8 @@ class Scheduler(threading.Thread):
830 item.current_build_set.node_requests.items(): 830 item.current_build_set.node_requests.items():
831 requests_to_cancel.append( 831 requests_to_cancel.append(
832 (item.current_build_set, request)) 832 (item.current_build_set, request))
833
834 semaphores_to_release = []
833 for build in builds_to_cancel: 835 for build in builds_to_cancel:
834 self.log.info( 836 self.log.info(
835 "Canceling build %s during reconfiguration" % (build,)) 837 "Canceling build %s during reconfiguration" % (build,))
@@ -848,14 +850,16 @@ class Scheduler(threading.Thread):
848 self.log.exception( 850 self.log.exception(
849 "Exception while removing nodeset from build %s " 851 "Exception while removing nodeset from build %s "
850 "for change %s" % (build, build.build_set.item.change)) 852 "for change %s" % (build, build.build_set.item.change))
851 tenant.semaphore_handler.release( 853 semaphores_to_release.append((build.build_set.item, build.job))
852 build.build_set.item, build.job)
853 for build_set, request in requests_to_cancel: 854 for build_set, request in requests_to_cancel:
854 self.log.info( 855 self.log.info(
855 "Canceling node request %s during reconfiguration", 856 "Canceling node request %s during reconfiguration",
856 request) 857 request)
857 self.nodepool.cancelRequest(request) 858 self.nodepool.cancelRequest(request)
858 build_set.removeJobNodeRequest(request.job.name) 859 build_set.removeJobNodeRequest(request.job.name)
860 semaphores_to_release.append((build_set.item, request.job))
861 for item, job in semaphores_to_release:
862 tenant.semaphore_handler.release(item, job)
859 863
860 def _reconfigureTenant(self, tenant): 864 def _reconfigureTenant(self, tenant):
861 # This is called from _doReconfigureEvent while holding the 865 # This is called from _doReconfigureEvent while holding the