Merge "Amazon EC2 driver"

This commit is contained in:
Zuul 2019-02-04 16:13:05 +00:00 committed by Gerrit Code Review
commit 7b640f7f48
11 changed files with 788 additions and 13 deletions

View File

@ -0,0 +1,27 @@
# Copyright 2018 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
from nodepool.driver import Driver
from nodepool.driver.aws.config import AwsProviderConfig
from nodepool.driver.aws.provider import AwsProvider
class AwsDriver(Driver):
def getProviderConfig(self, provider):
return AwsProviderConfig(self, provider)
def getProvider(self, provider_config, use_taskmanager):
return AwsProvider(provider_config, use_taskmanager)

View File

@ -0,0 +1,285 @@
# Copyright 2018 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import voluptuous as v
from nodepool.driver import ConfigPool
from nodepool.driver import ConfigValue
from nodepool.driver import ProviderConfig
class ProviderCloudImage(ConfigValue):
def __init__(self):
self.name = None
self.image_id = None
self.image_name = None
self.username = None
self.connection_type = None
self.connection_port = None
def __eq__(self, other):
if isinstance(other, ProviderCloudImage):
return (self.name == other.name
and self.image_id == other.image_id
and self.image_name == other.image_name
and self.username == other.username
and self.connection_type == other.connection_type
and self.connection_port == other.connection_port)
return False
def __repr__(self):
return "<ProviderCloudImage %s>" % self.name
@property
def external_name(self):
'''Human readable version of external.'''
return self.image_id or self.image_name or self.name
class ProviderLabel(ConfigValue):
def __init__(self):
self.name = None
self.cloud_image = None
self.flavor_name = None
self.key_name = None
self.volume_size = None
self.volume_type = None
# The ProviderPool object that owns this label.
self.pool = None
def __eq__(self, other):
if isinstance(other, ProviderLabel):
# NOTE(Shrews): We intentionally do not compare 'pool' here
# since this causes recursive checks with ProviderPool.
return (other.name == self.name
and other.cloud_image == self.cloud_image
and other.flavor_name == self.flavor_name
and other.key_name == self.key_name
and other.volume_size == self.volume_size
and other.volume_type == self.volume_type)
return False
def __repr__(self):
return "<ProviderLabel %s>" % self.name
class ProviderPool(ConfigPool):
def __init__(self):
self.name = None
self.max_cores = None
self.max_ram = None
self.ignore_provider_quota = False
self.availability_zone = None
self.subnet_id = None
self.security_group_id = None
self.host_key_checking = True
self.labels = None
# The ProviderConfig object that owns this pool.
self.provider = None
# Initialize base class attributes
super().__init__()
def load(self, pool_config, full_config, provider):
super().load(pool_config)
self.name = pool_config['name']
self.provider = provider
self.max_cores = pool_config.get('max-cores', math.inf)
self.max_ram = pool_config.get('max-ram', math.inf)
self.ignore_provider_quota = pool_config.get(
'ignore-provider-quota', False)
self.availability_zone = pool_config.get('availability-zone')
self.security_group_id = pool_config.get('security-group-id')
self.subnet_id = pool_config.get('subnet-id')
self.host_key_checking = bool(
pool_config.get('host-key-checking', True))
for label in pool_config.get('labels', []):
pl = ProviderLabel()
pl.name = label['name']
pl.pool = self
self.labels[pl.name] = pl
cloud_image_name = label.get('cloud-image', None)
if cloud_image_name:
cloud_image = self.provider.cloud_images.get(
cloud_image_name, None)
if not cloud_image:
raise ValueError(
"cloud-image %s does not exist in provider %s"
" but is referenced in label %s" %
(cloud_image_name, self.name, pl.name))
else:
cloud_image = None
pl.cloud_image = cloud_image
pl.flavor_name = label['flavor-name']
pl.key_name = label['key-name']
pl.volume_type = label.get('volume-type')
pl.volume_size = label.get('volume-size')
full_config.labels[label['name']].pools.append(self)
def __eq__(self, other):
if isinstance(other, ProviderPool):
# NOTE(Shrews): We intentionally do not compare 'provider' here
# since this causes recursive checks with OpenStackProviderConfig.
return (super().__eq__(other)
and other.name == self.name
and other.max_cores == self.max_cores
and other.max_ram == self.max_ram
and other.ignore_provider_quota == (
self.ignore_provider_quota)
and other.availability_zone == self.availability_zone
and other.subnet_id == self.subnet_id
and other.security_group_id == self.security_group_id
and other.host_key_checking == self.host_key_checking
and other.labels == self.labels)
return False
def __repr__(self):
return "<ProviderPool %s>" % self.name
class AwsProviderConfig(ProviderConfig):
def __init__(self, driver, provider):
self.driver_object = driver
self.__pools = {}
self.profile_name = None
self.region_name = None
self.rate = None
self.boot_timeout = None
self.launch_retries = None
self.launch_timeout = None
self.cloud_images = {}
self.hostname_format = None
self.image_name_format = None
super().__init__(provider)
def __eq__(self, other):
if isinstance(other, AwsProviderConfig):
return (super().__eq__(other)
and other.profile_name == self.profile_name
and other.region_name == self.region_name
and other.pools == self.pools
and other.rate == self.rate
and other.boot_timeout == self.boot_timeout
and other.launch_retries == self.launch_retries
and other.launch_timeout == self.launch_timeout
and other.cloud_images == self.cloud_images)
return False
@property
def pools(self):
return self.__pools
@property
def manage_images(self):
return True
@staticmethod
def reset():
pass
def load(self, config):
self.profile_name = self.provider.get('profile-name')
self.region_name = self.provider.get('region-name')
self.rate = float(self.provider.get('rate', 1.0))
self.boot_timeout = self.provider.get('boot-timeout', 60)
self.launch_retries = self.provider.get('launch-retries', 3)
self.launch_timeout = self.provider.get('launch-timeout', 3600)
self.hostname_format = self.provider.get(
'hostname-format',
'{label.name}-{provider.name}-{node.id}'
)
self.image_name_format = self.provider.get(
'image-name-format',
'{image_name}-{timestamp}'
)
default_port_mapping = {
'ssh': 22,
'winrm': 5986,
}
# TODO: diskimages
for image in self.provider.get('cloud-images', []):
i = ProviderCloudImage()
i.name = image['name']
i.image_id = image.get('image-id', None)
i.image_name = image.get('image-name', None)
i.username = image.get('username', None)
i.connection_type = image.get('connection-type', 'ssh')
i.connection_port = image.get(
'connection-port',
default_port_mapping.get(i.connection_type, 22))
self.cloud_images[i.name] = i
for pool in self.provider.get('pools', []):
pp = ProviderPool()
pp.load(pool, config, self)
self.pools[pp.name] = pp
def getSchema(self):
pool_label = {
v.Required('name'): str,
v.Exclusive('cloud-image', 'label-image'): str,
v.Required('flavor-name'): str,
v.Required('key-name'): str,
'volume-type': str,
'volume-size': int
}
pool = ConfigPool.getCommonSchemaDict()
pool.update({
v.Required('name'): str,
v.Required('labels'): [pool_label],
'max-cores': int,
'max-ram': int,
'availability-zone': str,
'security-group-id': str,
'subnet-id': str,
})
provider_cloud_images = {
'name': str,
'connection-type': str,
'connection-port': int,
v.Exclusive('image-id', 'cloud-image-name-or-id'): str,
v.Exclusive('image-name', 'cloud-image-name-or-id'): str,
'username': str,
}
provider = ProviderConfig.getCommonSchemaDict()
provider.update({
v.Required('pools'): [pool],
v.Required('region-name'): str,
'profile-name': str,
'cloud-images': [provider_cloud_images],
'rate': v.Coerce(float),
'hostname-format': str,
'image-name-format': str,
'boot-timeout': int,
'launch-timeout': int,
'launch-retries': int,
})
return v.Schema(provider)
def getSupportedLabels(self, pool_name=None):
labels = set()
for pool in self.pools.values():
if not pool_name or (pool.name == pool_name):
labels.update(pool.labels.keys())
return labels

View File

@ -0,0 +1,157 @@
# Copyright 2018 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import time
from nodepool import exceptions
from nodepool import zk
from nodepool.driver.utils import NodeLauncher
from nodepool.driver import NodeRequestHandler
from nodepool.nodeutils import nodescan
class AwsInstanceLauncher(NodeLauncher):
def __init__(self, handler, node, provider_config, provider_label):
super().__init__(handler.zk, node, provider_config)
self.retries = provider_config.launch_retries
self.pool = provider_config.pools[provider_label.pool.name]
self.handler = handler
self.zk = handler.zk
self.boot_timeout = provider_config.boot_timeout
self.label = provider_label
def launch(self):
self.log.debug("Starting %s instance" % self.node.type)
attempts = 1
while attempts <= self.retries:
try:
instance = self.handler.manager.createInstance(self.label)
break
except Exception:
if attempts <= self.retries:
self.log.exception(
"Launch attempt %d/%d failed for node %s:",
attempts, self.retries, self.node.id)
if attempts == self.retries:
raise
attempts += 1
time.sleep(1)
instance.create_tags(Tags=[{'Key': 'nodepool_id',
'Value': str(self.node.id)}])
instance_id = instance.id
self.node.external_id = instance_id
self.zk.storeNode(self.node)
boot_start = time.monotonic()
while time.monotonic() - boot_start < self.boot_timeout:
state = instance.state.get('Name')
self.log.debug("Instance %s is %s" % (instance_id, state))
if state == 'running':
break
time.sleep(0.5)
instance.reload()
if state != 'running':
raise exceptions.LaunchStatusException(
"Instance %s failed to start: %s" % (instance_id, state))
server_ip = instance.public_ip_address
if not server_ip:
raise exceptions.LaunchStatusException(
"Instance %s doesn't have a public ip" % instance_id)
self.node.connection_port = self.label.cloud_image.connection_port
self.node.connection_type = self.label.cloud_image.connection_type
if self.pool.host_key_checking:
try:
if self.node.connection_type == 'ssh':
gather_hostkeys = True
else:
gather_hostkeys = False
keys = nodescan(server_ip, port=self.node.connection_port,
timeout=180, gather_hostkeys=gather_hostkeys)
except Exception:
raise exceptions.LaunchKeyscanException(
"Can't scan instance %s key" % instance_id)
self.log.info("Instance %s ready" % instance_id)
self.node.state = zk.READY
self.node.external_id = instance_id
self.node.hostname = server_ip
self.node.interface_ip = server_ip
self.node.public_ipv4 = server_ip
self.node.host_keys = keys
self.node.username = self.label.cloud_image.username
self.zk.storeNode(self.node)
self.log.info("Instance %s is ready", instance_id)
class AwsNodeRequestHandler(NodeRequestHandler):
log = logging.getLogger("nodepool.driver.aws."
"AwsNodeRequestHandler")
def __init__(self, pw, request):
super().__init__(pw, request)
self._threads = []
@property
def alive_thread_count(self):
count = 0
for t in self._threads:
if t.isAlive():
count += 1
return count
def imagesAvailable(self):
'''
Determines if the requested images are available for this provider.
:returns: True if it is available, False otherwise.
'''
if self.provider.manage_images:
for label in self.request.node_types:
if self.pool.labels[label].cloud_image:
if not self.manager.labelReady(self.pool.labels[label]):
return False
return True
def launchesComplete(self):
'''
Check if all launch requests have completed.
When all of the Node objects have reached a final state (READY or
FAILED), we'll know all threads have finished the launch process.
'''
if not self._threads:
return True
# Give the NodeLaunch threads time to finish.
if self.alive_thread_count:
return False
node_states = [node.state for node in self.nodeset]
# NOTE: It very important that NodeLauncher always sets one of
# these states, no matter what.
if not all(s in (zk.READY, zk.FAILED) for s in node_states):
return False
return True
def launch(self, node):
label = self.pool.labels[node.type[0]]
thd = AwsInstanceLauncher(self, node, self.provider, label)
thd.start()
self._threads.append(thd)

View File

@ -0,0 +1,151 @@
# Copyright 2018 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import boto3
from nodepool.driver import Provider
from nodepool.driver.aws.handler import AwsNodeRequestHandler
class AwsInstance:
def __init__(self, name, metadatas, provider):
self.id = name
self.name = name
self.metadata = {}
if metadatas:
for metadata in metadatas:
if metadata["Key"] == "nodepool_id":
self.metadata = {
'nodepool_provider_name': provider.name,
'nodepool_node_id': metadata["Value"],
}
break
def get(self, name, default=None):
return getattr(self, name, default)
class AwsProvider(Provider):
log = logging.getLogger("nodepool.driver.aws.AwsProvider")
def __init__(self, provider, *args):
self.provider = provider
self.ec2 = None
def getRequestHandler(self, poolworker, request):
return AwsNodeRequestHandler(poolworker, request)
def start(self, zk_conn):
if self.ec2 is not None:
return True
self.log.debug("Starting")
self.aws = boto3.Session(
region_name=self.provider.region_name,
profile_name=self.provider.profile_name)
self.ec2 = self.aws.resource('ec2')
def stop(self):
self.log.debug("Stopping")
def listNodes(self):
servers = []
for instance in self.ec2.instances.all():
if instance.state["Name"].lower() == "terminated":
continue
servers.append(AwsInstance(
instance.id, instance.tags, self.provider))
return servers
def getImage(self, image_id):
return self.ec2.Image(image_id)
def labelReady(self, label):
if not label.cloud_image:
msg = "A cloud-image (AMI) must be supplied with the AWS driver."
raise Exception(msg)
image = self.getImage(label.cloud_image.external_name)
# Image loading is deferred, check if it's really there
if image.state != 'available':
self.log.warning(
"Provider %s is configured to use %s as the AMI for"
" label %s and that AMI is there but unavailable in the"
" cloud." % (self.provider.name,
label.cloud_image.external_name,
label.name))
return False
return True
def join(self):
return True
def cleanupLeakedResources(self):
# TODO: remove leaked resources if any
pass
def cleanupNode(self, server_id):
if self.ec2 is None:
return False
instance = self.ec2.Instance(server_id)
instance.terminate()
def waitForNodeCleanup(self, server_id):
# TODO: track instance deletion
return True
def createInstance(self, label):
image_name = label.cloud_image.external_name
args = dict(
ImageId=image_name,
MinCount=1,
MaxCount=1,
KeyName=label.key_name,
InstanceType=label.flavor_name,
NetworkInterfaces=[{
'AssociatePublicIpAddress': True,
'DeviceIndex': 0}])
if label.pool.security_group_id:
args['NetworkInterfaces'][0]['Groups'] = [
label.pool.security_group_id
]
if label.pool.subnet_id:
args['NetworkInterfaces'][0]['SubnetId'] = label.pool.subnet_id
# Default block device mapping parameters are embedded in AMIs.
# We might need to supply our own mapping before lauching the instance.
# We basically want to make sure DeleteOnTermination is true and be
# able to set the volume type and size.
image = self.getImage(image_name)
# TODO: Flavors can also influence whether or not the VM spawns with a
# volume -- we basically need to ensure DeleteOnTermination is true
if hasattr(image, 'block_device_mappings'):
bdm = image.block_device_mappings
mapping = bdm[0]
if 'Ebs' in mapping:
mapping['Ebs']['DeleteOnTermination'] = True
if label.volume_size:
mapping['Ebs']['VolumeSize'] = label.volume_size
if label.volume_type:
mapping['Ebs']['VolumeType'] = label.volume_type
# If the AMI is a snapshot, we cannot supply an "encrypted"
# parameter
if 'Encrypted' in mapping['Ebs']:
del mapping['Ebs']['Encrypted']
args['BlockDeviceMappings'] = [mapping]
instances = self.ec2.create_instances(**args)
return self.ec2.Instance(instances[0].id)

View File

@ -334,19 +334,22 @@ class DBTestCase(BaseTestCase):
self.useFixture(images_dir)
build_log_dir = fixtures.TempDir()
self.useFixture(build_log_dir)
configfile = os.path.join(os.path.dirname(__file__),
'fixtures', filename)
(fd, path) = tempfile.mkstemp()
with open(configfile, 'rb') as conf_fd:
config = conf_fd.read().decode('utf8')
data = config.format(images_dir=images_dir.path,
build_log_dir=build_log_dir.path,
context_name=context_name,
zookeeper_host=self.zookeeper_host,
zookeeper_port=self.zookeeper_port,
zookeeper_chroot=self.zookeeper_chroot)
os.write(fd, data.encode('utf8'))
os.close(fd)
if filename.startswith('/'):
path = filename
else:
configfile = os.path.join(os.path.dirname(__file__),
'fixtures', filename)
(fd, path) = tempfile.mkstemp()
with open(configfile, 'rb') as conf_fd:
config = conf_fd.read().decode('utf8')
data = config.format(images_dir=images_dir.path,
build_log_dir=build_log_dir.path,
context_name=context_name,
zookeeper_host=self.zookeeper_host,
zookeeper_port=self.zookeeper_port,
zookeeper_chroot=self.zookeeper_chroot)
os.write(fd, data.encode('utf8'))
os.close(fd)
self._config_images_dir = images_dir
self._config_build_log_dir = build_log_dir
validator = ConfigValidator(path)

26
nodepool/tests/fixtures/aws.yaml vendored Normal file
View File

@ -0,0 +1,26 @@
zookeeper-servers:
- host: null
port: null
chroot: null
labels:
- name: ubuntu1404
providers:
- name: ec2-us-west-2
driver: aws
region-name: us-west-2
cloud-images:
- name: ubuntu1404
image-id: ami-1e749f67
username: ubuntu
pools:
- name: main
max-servers: 5
subnet-id: null
security-group-id: null
labels:
- name: ubuntu1404
cloud-image: ubuntu1404
flavor-name: t3.medium
key-name: zuul

View File

@ -23,6 +23,7 @@ labels:
- name: pod-fedora
- name: openshift-project
- name: openshift-pod
- name: centos-ami
providers:
- name: cloud1
@ -138,6 +139,27 @@ providers:
memory: 512
cpu: 2
- name: ec2-us-east-2
driver: aws
region-name: us-east-2
profile-name: default
cloud-images:
- name: centos-ami
image-id: ami-cfdafaaa
username: centos
pools:
- name: main
max-servers: 42
security-group-id: sg-8bfe86352e334a80a
subnet-id: subnet-bb3605b5f0fa40e1b
labels:
- name: centos-ami
cloud-image: centos-ami
flavor-name: t2.micro
key-name: zuul
volume-type: gp2
volume-size: 80
diskimages:
- name: trusty
formats:

View File

@ -0,0 +1,95 @@
# Copyright (C) 2018 Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import fixtures
import logging
import os
import tempfile
from unittest.mock import patch
import boto3
from moto import mock_ec2
import yaml
from nodepool import tests
from nodepool import zk
class TestDriverAws(tests.DBTestCase):
log = logging.getLogger("nodepool.TestDriverAws")
@mock_ec2
def test_ec2_machine(self):
aws_id = 'AK000000000000000000'
aws_key = '0123456789abcdef0123456789abcdef0123456789abcdef'
self.useFixture(
fixtures.EnvironmentVariable('AWS_ACCESS_KEY_ID', aws_id))
self.useFixture(
fixtures.EnvironmentVariable('AWS_SECRET_ACCESS_KEY', aws_key))
ec2 = boto3.client('ec2', region_name='us-west-2')
# TEST-NET-3
vpc = ec2.create_vpc(CidrBlock='203.0.113.0/24')
subnet = ec2.create_subnet(
CidrBlock='203.0.113.128/25', VpcId=vpc['Vpc']['VpcId'])
subnet_id = subnet['Subnet']['SubnetId']
sg = ec2.create_security_group(
GroupName='zuul-nodes', VpcId=vpc['Vpc']['VpcId'],
Description='Zuul Nodes')
sg_id = sg['GroupId']
ec2_template = os.path.join(
os.path.dirname(__file__), '..', 'fixtures', 'aws.yaml')
raw_config = yaml.safe_load(open(ec2_template))
raw_config['zookeeper-servers'][0] = {
'host': self.zookeeper_host,
'port': self.zookeeper_port,
'chroot': self.zookeeper_chroot,
}
raw_config['providers'][0]['pools'][0]['subnet-id'] = subnet_id
raw_config['providers'][0]['pools'][0]['security-group-id'] = sg_id
with tempfile.NamedTemporaryFile() as tf:
tf.write(yaml.safe_dump(
raw_config, default_flow_style=False).encode('utf-8'))
tf.flush()
configfile = self.setup_config(tf.name)
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('ubuntu1404')
with patch('nodepool.driver.aws.handler.nodescan') as nodescan:
nodescan.return_value = 'MOCK KEY'
self.zk.storeNodeRequest(req)
self.log.debug("Waiting for request %s", req.id)
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
self.assertNotEqual(req.nodes, [])
node = self.zk.getNode(req.nodes[0])
self.assertEqual(node.allocated_to, req.id)
self.assertEqual(node.state, zk.READY)
self.assertIsNotNone(node.launcher)
self.assertEqual(node.connection_type, 'ssh')
nodescan.assert_called_with(
node.interface_ip, port=22, timeout=180, gather_hostkeys=True)
node.state = zk.DELETING
self.zk.storeNode(node)
self.waitForNodeDeletion(node)

View File

@ -0,0 +1,7 @@
---
prelude: Amazon Web Services (AWS) EC2 Driver
features:
- The new Amazon Web Services (AWS) EC2 Driver allows launching EC2 instances as nodes.
issues:
- The AWS driver does not support quota management at this time.
- The AWS driver does not support custom image building.

View File

@ -14,3 +14,4 @@ kazoo
Paste
WebOb>=1.8.1
openshift
boto3

View File

@ -7,3 +7,4 @@ python-subunit
stestr>=1.0.0 # Apache-2.0
testscenarios
testtools>=0.9.27
moto