Many strategies execute very similar statements, especially in pre_execute, and some might raise errors that others do not. The same pattern of many similar statements can also be observed in the strategies' tests. This patch addresses these issues. Firstly, the BaseStrategy class gets one additional method, _pre_execute, which holds the general logic that most strategies perform at that stage. This method can be executed before the similarly named method of the superclass. A notable change is that _pre_execute now performs the common exception handling for ClusterStateStale & ClusterStateNotDefined exceptions. A similar pattern is applied to the test classes of the strategies: each of these classes now inherits from the TestBaseStrategy class. This class provides the common attributes almost every strategy test class requires, such as the mocked compute_model, the mocked audit_scope and an instance of FakerModelCollector. Finally, some minor changes were required in test_strategy_context & test_audit_handlers, and exceptions around 0 nodes in cluster or storage are removed. Change-Id: Ia7154376b2448aac65cf17999cc8c3e1c8309b5b
326 lines
12 KiB
Python
326 lines
12 KiB
Python
# -*- encoding: utf-8 -*-
|
|
# Copyright (c) 2017 chinac.com
|
|
#
|
|
# Authors: suzhengwei<suzhengwei@chinac.com>
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from oslo_log import log
|
|
import six
|
|
|
|
from watcher._i18n import _
|
|
from watcher.common import exception
|
|
from watcher.decision_engine.model import element
|
|
from watcher.decision_engine.strategy.strategies import base
|
|
|
|
LOG = log.getLogger(__name__)
|
|
|
|
|
|
class HostMaintenance(base.HostMaintenanceBaseStrategy):
    """[PoC]Host Maintenance

    *Description*

    It is a migration strategy for maintaining one compute node
    without interrupting the user's applications.
    If a backup node is given, the strategy will first try to
    migrate all instances from the maintenance node to
    the backup node. If the backup node is not provided (or cannot
    hold the instances), it tries the compute nodes that are disabled
    with reason ``watcher_disabled`` and, failing that, migrates all
    instances relying on nova-scheduler.

    *Requirements*

    * You must have at least 2 physical compute nodes to run this strategy.

    *Limitations*

    - This is a proof of concept that is not meant to be used in production
    - It migrates all instances from one host to other hosts. It's better to
      execute such strategy when load is not heavy, and use this algorithm
      with `ONESHOT` audit.
    - It assumes that cold and live migrations are possible.
    """

    # Action type names handed to ``self.solution.add_action``.
    INSTANCE_MIGRATION = "migrate"
    CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state"
    # ``disabled_reason`` that marks a node as a candidate backup host
    # (see ``get_disabled_compute_nodes``).
    REASON_FOR_DISABLE = 'watcher_disabled'

    def __init__(self, config, osc=None):
        """Forward ``config`` and ``osc`` unchanged to the base strategy."""
        super(HostMaintenance, self).__init__(config, osc)

    @classmethod
    def get_name(cls):
        """Return the machine-readable strategy name."""
        return "host_maintenance"

    @classmethod
    def get_display_name(cls):
        """Return the translated, human-readable strategy name."""
        return _("Host Maintenance Strategy")

    @classmethod
    def get_translatable_display_name(cls):
        """Return the untranslated, human-readable strategy name."""
        return "Host Maintenance Strategy"

    @classmethod
    def get_schema(cls):
        """Return the schema of the strategy input parameters.

        ``maintenance_node`` is required; ``backup_node`` is optional.
        """
        return {
            "properties": {
                "maintenance_node": {
                    "description": "The name of the compute node which "
                                   "need maintenance",
                    "type": "string",
                },
                "backup_node": {
                    "description": "The name of the compute node which "
                                   "will backup the maintenance node.",
                    "type": "string",
                },
            },
            "required": ["maintenance_node"],
        }

    def get_disabled_compute_nodes_with_reason(self, reason=None):
        """Return the online-but-disabled compute nodes matching ``reason``.

        :param reason: value compared against each node's
                       ``disabled_reason``
        :return: dict with the same keys as
                 ``compute_model.get_all_compute_nodes()``, restricted to
                 nodes whose state is ONLINE, whose status is DISABLED and
                 whose ``disabled_reason`` equals ``reason``
        """
        return {uuid: cn for uuid, cn in
                self.compute_model.get_all_compute_nodes().items()
                if cn.state == element.ServiceState.ONLINE.value and
                cn.status == element.ServiceState.DISABLED.value and
                cn.disabled_reason == reason}

    def get_disabled_compute_nodes(self):
        """Return the nodes disabled with reason ``REASON_FOR_DISABLE``."""
        return self.get_disabled_compute_nodes_with_reason(
            self.REASON_FOR_DISABLE)

    def get_instance_state_str(self, instance):
        """Get instance state in string format

        :param instance: instance object
        :return: ``instance.state`` itself when it is a string, or its
                 ``value`` when it is an ``element.InstanceState``
        :raises: exception.WatcherException for any other state type
        """
        if isinstance(instance.state, six.string_types):
            return instance.state
        elif isinstance(instance.state, element.InstanceState):
            return instance.state.value
        else:
            LOG.error('Unexpected instance state type, '
                      'state=%(state)s, state_type=%(st)s.',
                      dict(state=instance.state,
                           st=type(instance.state)))
            raise exception.WatcherException

    def get_node_status_str(self, node):
        """Get node status in string format

        :param node: node object
        :return: ``node.status`` itself when it is a string, or its
                 ``value`` when it is an ``element.ServiceState``
        :raises: exception.WatcherException for any other status type
        """
        if isinstance(node.status, six.string_types):
            return node.status
        elif isinstance(node.status, element.ServiceState):
            return node.status.value
        else:
            LOG.error('Unexpected node status type, '
                      'status=%(status)s, status_type=%(st)s.',
                      dict(status=node.status,
                           st=type(node.status)))
            raise exception.WatcherException

    def get_node_capacity(self, node):
        """Collect cpu, ram and disk capacity of a node.

        :param node: node object
        :return: dict(cpu(cores), ram(MB), disk(B))
        """
        return dict(cpu=node.vcpus,
                    ram=node.memory,
                    disk=node.disk_capacity)

    def get_node_used(self, node):
        """Collect cpu, ram and disk used of a node.

        The usage is the sum over the instances the compute model
        reports for the node.

        :param node: node object
        :return: dict(cpu(cores), ram(MB), disk(B))
        """
        vcpus_used = 0
        memory_used = 0
        disk_used = 0
        for instance in self.compute_model.get_node_instances(node):
            vcpus_used += instance.vcpus
            memory_used += instance.memory
            disk_used += instance.disk

        return dict(cpu=vcpus_used,
                    ram=memory_used,
                    disk=disk_used)

    def get_node_free(self, node):
        """Collect cpu, ram and disk free of a node.

        Free resources are capacity minus usage, per metric.

        :param node: node object
        :return: dict(cpu(cores), ram(MB), disk(B))
        """
        node_capacity = self.get_node_capacity(node)
        node_used = self.get_node_used(node)
        return dict(cpu=node_capacity['cpu'] - node_used['cpu'],
                    ram=node_capacity['ram'] - node_used['ram'],
                    disk=node_capacity['disk'] - node_used['disk'],
                    )

    def host_fits(self, source_node, destination_node):
        """check host fits

        return True if VMs could intensively migrate
        from source_node to destination_node.

        Only the 'cpu' and 'ram' metrics are compared; disk is not
        taken into account here.
        """
        source_node_used = self.get_node_used(source_node)
        destination_node_free = self.get_node_free(destination_node)
        metrics = ['cpu', 'ram']
        for m in metrics:
            if source_node_used[m] > destination_node_free[m]:
                return False
        return True

    def add_action_enable_compute_node(self, node):
        """Add an action for node enabler into the solution."""
        params = {'state': element.ServiceState.ENABLED.value}
        self.solution.add_action(
            action_type=self.CHANGE_NOVA_SERVICE_STATE,
            resource_id=node.uuid,
            input_parameters=params)

    def add_action_maintain_compute_node(self, node):
        """Add an action for node maintenance into the solution."""
        # NOTE(review): REASON_FOR_MAINTAINING is not defined in this
        # class; presumably it is inherited from
        # base.HostMaintenanceBaseStrategy -- confirm.
        params = {'state': element.ServiceState.DISABLED.value,
                  'disabled_reason': self.REASON_FOR_MAINTAINING}
        self.solution.add_action(
            action_type=self.CHANGE_NOVA_SERVICE_STATE,
            resource_id=node.uuid,
            input_parameters=params)

    def enable_compute_node_if_disabled(self, node):
        """Add an enable action unless the node status is already ENABLED.

        :param node: node object
        :raises: exception.WatcherException if the node status has an
                 unexpected type (see ``get_node_status_str``)
        """
        node_status_str = self.get_node_status_str(node)
        if node_status_str != element.ServiceState.ENABLED.value:
            self.add_action_enable_compute_node(node)

    def instance_migration(self, instance, src_node, des_node=None):
        """Add an action for instance migration into the solution.

        ACTIVE instances are migrated 'live', all others 'cold'.

        :param instance: instance object
        :param src_node: node object
        :param des_node: node object. if None, the instance will be
                         migrated relying on nova-scheduler
        :return: None
        """
        instance_state_str = self.get_instance_state_str(instance)
        if instance_state_str == element.InstanceState.ACTIVE.value:
            migration_type = 'live'
        else:
            migration_type = 'cold'

        params = {'migration_type': migration_type,
                  'source_node': src_node.uuid}
        # Without an explicit destination the parameter is simply omitted.
        if des_node:
            params['destination_node'] = des_node.uuid
        self.solution.add_action(action_type=self.INSTANCE_MIGRATION,
                                 resource_id=instance.uuid,
                                 input_parameters=params)

    def host_migration(self, source_node, destination_node):
        """host migration

        Migrate all instances from source_node to destination_node.
        Active instances use "live-migrate",
        and other instances use "cold-migrate"
        """
        instances = self.compute_model.get_node_instances(source_node)
        for instance in instances:
            self.instance_migration(instance, source_node, destination_node)

    def _migrate_if_fits(self, maintenance_node, destination_node):
        """Plan a full evacuation onto destination_node if it has room.

        When ``host_fits`` accepts the pair, the destination is enabled
        if needed, the maintenance node gets its maintain action and one
        migration per instance is added to the solution.

        :param maintenance_node: node object to be evacuated
        :param destination_node: node object that should receive the
                                 instances
        :return: True if the actions were added, False if the
                 destination does not fit (nothing is added then)
        """
        if not self.host_fits(maintenance_node, destination_node):
            return False
        self.enable_compute_node_if_disabled(destination_node)
        self.add_action_maintain_compute_node(maintenance_node)
        self.host_migration(maintenance_node, destination_node)
        return True

    def safe_maintain(self, maintenance_node, backup_node=None):
        """safe maintain one compute node

        Migrate all instances of the maintenance_node intensively to the
        backup host. If the user didn't give the backup host, or it
        cannot hold the instances, it will select one unused node to
        backup the maintaining node.

        It calculates the resources of both the candidate node and the
        maintaining node to evaluate the migrations between them.
        If all instances of the maintaining node can be migrated to
        the candidate node, it adds the maintain action for the
        maintaining node and the migrations to the solution.

        :param maintenance_node: node object to be maintained
        :param backup_node: optional node object preferred as target
        :return: True if a fitting target was found and the actions were
                 added, False otherwise
        """
        # If the user gives a backup node with required capacity, then
        # migrate all instances from the maintaining node to it.
        if backup_node and self._migrate_if_fits(maintenance_node,
                                                 backup_node):
            return True

        # Otherwise select one unused node with required capacity,
        # trying the ones with the fewest vcpus first.
        nodes = sorted(
            self.get_disabled_compute_nodes().values(),
            key=lambda x: self.get_node_capacity(x)['cpu'])
        if maintenance_node in nodes:
            nodes.remove(maintenance_node)

        # any() stops at the first node that fits, so at most one
        # candidate ever receives actions.
        return any(self._migrate_if_fits(maintenance_node, node)
                   for node in nodes)

    def try_maintain(self, maintenance_node):
        """try to maintain one compute node

        It firstly adds the maintain action for the maintenance_node.
        Then it tries to migrate all instances of the maintenance node,
        relying on nova-scheduler (no destination is specified).
        """
        self.add_action_maintain_compute_node(maintenance_node)
        instances = self.compute_model.get_node_instances(maintenance_node)
        for instance in instances:
            self.instance_migration(instance, maintenance_node)

    def pre_execute(self):
        """Pre-execution phase

        Delegates to ``_pre_execute``, which is not defined in this
        class and is expected to come from the base strategy.
        """
        self._pre_execute()

    def do_execute(self):
        """Build the maintenance solution from the input parameters.

        Reads ``maintenance_node`` and the optional ``backup_node`` from
        ``self.input_parameters`` and resolves them through
        ``compute_model.get_node_by_uuid``. ``safe_maintain`` is tried
        first; if it finds no fitting target, ``try_maintain`` is used.
        """
        LOG.info(_('Executing Host Maintenance Migration Strategy'))

        maintenance_node = self.input_parameters.get('maintenance_node')
        backup_node = self.input_parameters.get('backup_node')

        # if no VMs in the maintenance_node, just maintain the compute node
        src_node = self.compute_model.get_node_by_uuid(maintenance_node)
        if not self.compute_model.get_node_instances(src_node):
            if (src_node.disabled_reason !=
                    self.REASON_FOR_MAINTAINING):
                self.add_action_maintain_compute_node(src_node)
                # NOTE(review): this return is nested under the reason
                # check, so an empty node that is already flagged for
                # maintenance falls through to the logic below --
                # confirm that this nesting is the intended one.
                return

        if backup_node:
            des_node = self.compute_model.get_node_by_uuid(backup_node)
        else:
            des_node = None

        if not self.safe_maintain(src_node, des_node):
            self.try_maintain(src_node)

    def post_execute(self):
        """Post-execution phase

        This can be used to compute the global efficacy. Currently it
        only logs the solution actions and the compute model at debug
        level.
        """
        LOG.debug(self.solution.actions)
        LOG.debug(self.compute_model.to_string())
|