In this changeset, I implemented a background scheduler service for Watcher, and more specifically for the Decision Engine, where I made it create 2 types of job per cluster data model collector plugin:

- An initial job that is asynchronously executed upon starting the Decision Engine
- A periodic job that is triggered at a configurable time interval

Change-Id: I3f5442f81933a19565217b894bd86c186e339762
Partially-Implements: blueprint cluster-model-objects-wrapper
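A minimal sketch of the two job types described above, assuming an APScheduler-style BackgroundScheduler; the collector plugin and its synchronize() method are illustrative names, not taken from this change:

    from apscheduler.schedulers.background import BackgroundScheduler

    def schedule_collector_jobs(scheduler, collector, interval=3600):
        # Initial job: with no trigger given, APScheduler runs the job
        # once, as soon as possible after the scheduler starts.
        scheduler.add_job(collector.synchronize)
        # Periodic job: re-run the collector at a configurable interval.
        scheduler.add_job(collector.synchronize, 'interval',
                          seconds=interval)

    scheduler = BackgroundScheduler()
    # schedule_collector_jobs(scheduler, plugin) for each collector plugin
    scheduler.start()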
# -*- encoding: utf-8 -*-
#
# Authors: Vojtech CIMA <cima@zhaw.ch>
#          Bruno GRAZIOLI <gaea@zhaw.ch>
#          Sean MURPHY <murp@zhaw.ch>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from oslo_log import log
import six

from watcher._i18n import _, _LE, _LI
from watcher.common import exception
from watcher.decision_engine.model import hypervisor_state as hyper_state
from watcher.decision_engine.model import resource
from watcher.decision_engine.model import vm_state
from watcher.decision_engine.strategy.strategies import base
from watcher.metrics_engine.cluster_history import ceilometer \
    as ceilometer_cluster_history

LOG = log.getLogger(__name__)


class VMWorkloadConsolidation(base.ServerConsolidationBaseStrategy):
    """VM Workload Consolidation Strategy.

    *Description*

    A load consolidation strategy based on a heuristic first-fit
    algorithm which focuses on measured CPU utilization and tries to
    minimize hosts which have too much or too little load, respecting
    resource capacity constraints.

    This strategy produces a solution resulting in more efficient
    utilization of cluster resources using the following four phases:

    * Offload phase - handling over-utilized resources
    * Consolidation phase - handling under-utilized resources
    * Solution optimization - reducing the number of migrations
    * Disabling of unused hypervisors

    Capacity coefficients (cc) might be used to adjust the optimization
    thresholds. Different resources may require different coefficient
    values, and setting different coefficient values for the two phases
    may lead to a more efficient consolidation in the end.
    If a cc equals 1, the full resource capacity may be used; cc values
    lower than 1 will lead to resource under-utilization and values
    higher than 1 will lead to resource overbooking.
    e.g. if the targeted utilization is 80 percent of the hypervisor
    capacity, the coefficient in the consolidation phase will be 0.8,
    but it may be any lower value in the offloading phase. The lower it
    gets, the more spread out (distributed) the cluster will appear to
    the following consolidation phase.

    As this strategy leverages VM live migration to move the load
    from one hypervisor to another, this feature needs to be set up
    correctly on all hypervisors within the cluster.
    This strategy assumes it is possible to live migrate any VM from
    an active hypervisor to any other active hypervisor.

    *Requirements*

    * You must have at least 2 physical compute nodes to run this strategy.

    *Limitations*

    <None>

    *Spec URL*

    https://github.com/openstack/watcher-specs/blob/master/specs/mitaka/implemented/zhaw-load-consolidation.rst
    """  # noqa

    def __init__(self, config, osc=None):
        super(VMWorkloadConsolidation, self).__init__(config, osc)
        self._ceilometer = None
        self.number_of_migrations = 0
        self.number_of_released_hypervisors = 0
        self.ceilometer_vm_data_cache = dict()

    @classmethod
    def get_name(cls):
        return "vm_workload_consolidation"

    @classmethod
    def get_display_name(cls):
        return _("VM Workload Consolidation Strategy")

    @classmethod
    def get_translatable_display_name(cls):
        return "VM Workload Consolidation Strategy"

    @property
    def ceilometer(self):
        if self._ceilometer is None:
            self._ceilometer = (ceilometer_cluster_history.
                                CeilometerClusterHistory(osc=self.osc))
        return self._ceilometer

    @ceilometer.setter
    def ceilometer(self, ceilometer):
        self._ceilometer = ceilometer

    def get_state_str(self, state):
        """Get a resource state in string format.

        :param state: resource state of unknown type
        """
        if isinstance(state, six.string_types):
            return state
        elif isinstance(state, (vm_state.VMState,
                                hyper_state.HypervisorState)):
            return state.value
        else:
            LOG.error(_LE('Unexpected resource state type, '
                          'state=%(state)s, state_type=%(st)s.'),
                      state=state,
                      st=type(state))
            raise exception.WatcherException

    def add_action_enable_hypervisor(self, hypervisor):
        """Add an action for enabling a hypervisor to the solution.

        :param hypervisor: hypervisor object
        :return: None
        """
        params = {'state': hyper_state.HypervisorState.ENABLED.value}
        self.solution.add_action(
            action_type='change_nova_service_state',
            resource_id=hypervisor.uuid,
            input_parameters=params)
        self.number_of_released_hypervisors -= 1

    def add_action_disable_hypervisor(self, hypervisor):
        """Add an action for disabling a hypervisor to the solution.

        :param hypervisor: hypervisor object
        :return: None
        """
        params = {'state': hyper_state.HypervisorState.DISABLED.value}
        self.solution.add_action(
            action_type='change_nova_service_state',
            resource_id=hypervisor.uuid,
            input_parameters=params)
        self.number_of_released_hypervisors += 1

    def add_migration(self, vm_uuid, src_hypervisor,
                      dst_hypervisor, model):
        """Add an action for a VM migration to the solution.

        :param vm_uuid: vm uuid
        :param src_hypervisor: hypervisor object
        :param dst_hypervisor: hypervisor object
        :param model: model_root object
        :return: None
        """
        vm = model.get_vm_from_id(vm_uuid)

        vm_state_str = self.get_state_str(vm.state)
        if vm_state_str != vm_state.VMState.ACTIVE.value:
            # Watcher currently only supports live VM migration and block
            # live VM migration, which both require the migrated VM to be
            # active. When supported, cold migration may be used as a
            # fallback migration mechanism to move non-active VMs.
            LOG.error(_LE('Cannot live migrate: vm_uuid=%(vm_uuid)s, '
                          'state=%(vm_state)s.'),
                      vm_uuid=vm_uuid,
                      vm_state=vm_state_str)
            raise exception.WatcherException

        migration_type = 'live'

        # Make sure the destination can receive the VM before mapping
        # the migration into the model.
        dst_hyper_state_str = self.get_state_str(dst_hypervisor.state)
        if dst_hyper_state_str == hyper_state.HypervisorState.DISABLED.value:
            self.add_action_enable_hypervisor(dst_hypervisor)
        model.get_mapping().unmap(src_hypervisor, vm)
        model.get_mapping().map(dst_hypervisor, vm)

        params = {'migration_type': migration_type,
                  'src_hypervisor': src_hypervisor.uuid,
                  'dst_hypervisor': dst_hypervisor.uuid}
        self.solution.add_action(action_type='migrate',
                                 resource_id=vm.uuid,
                                 input_parameters=params)
        self.number_of_migrations += 1
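
    # The migrate action recorded above lands in self.solution.actions in
    # a shape along these lines (resource_id is assumed to be merged into
    # input_parameters, which is how optimize_solution() below reads it
    # back):
    #
    #     {'action_type': 'migrate',
    #      'input_parameters': {'migration_type': 'live',
    #                           'src_hypervisor': <hypervisor uuid>,
    #                           'dst_hypervisor': <hypervisor uuid>,
    #                           'resource_id': <vm uuid>}}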

    def disable_unused_hypervisors(self, model):
        """Generate actions for disabling unused hypervisors.

        :param model: model_root object
        :return: None
        """
        for hypervisor in model.get_all_hypervisors().values():
            if (len(model.get_mapping().get_node_vms(hypervisor)) == 0 and
                    hypervisor.status !=
                    hyper_state.HypervisorState.DISABLED.value):
                self.add_action_disable_hypervisor(hypervisor)

    def get_vm_utilization(self, vm_uuid, model, period=3600, aggr='avg'):
        """Collect cpu, ram and disk utilization statistics of a VM.

        :param vm_uuid: vm uuid
        :param model: model_root object
        :param period: seconds
        :param aggr: string
        :return: dict(cpu(number of vcpus used), ram(MB used), disk(B used))
        """
        if vm_uuid in self.ceilometer_vm_data_cache.keys():
            return self.ceilometer_vm_data_cache.get(vm_uuid)

        cpu_util_metric = 'cpu_util'
        ram_util_metric = 'memory.usage'

        ram_alloc_metric = 'memory'
        disk_alloc_metric = 'disk.root.size'
        vm_cpu_util = self.ceilometer.statistic_aggregation(
            resource_id=vm_uuid, meter_name=cpu_util_metric,
            period=period, aggregate=aggr)
        vm_cpu_cores = model.get_resource_from_id(
            resource.ResourceType.cpu_cores).get_capacity(
                model.get_vm_from_id(vm_uuid))

        if vm_cpu_util:
            total_cpu_utilization = vm_cpu_cores * (vm_cpu_util / 100.0)
        else:
            # Without cpu_util samples, conservatively assume the VM
            # uses all of its vCPUs.
            total_cpu_utilization = vm_cpu_cores

        vm_ram_util = self.ceilometer.statistic_aggregation(
            resource_id=vm_uuid, meter_name=ram_util_metric,
            period=period, aggregate=aggr)

        if not vm_ram_util:
            # Fall back from measured memory usage to allocated memory.
            vm_ram_util = self.ceilometer.statistic_aggregation(
                resource_id=vm_uuid, meter_name=ram_alloc_metric,
                period=period, aggregate=aggr)

        vm_disk_util = self.ceilometer.statistic_aggregation(
            resource_id=vm_uuid, meter_name=disk_alloc_metric,
            period=period, aggregate=aggr)

        if not vm_ram_util or not vm_disk_util:
            LOG.error(
                _LE('No values returned by %(resource_id)s '
                    'for memory.usage or disk.root.size'),
                resource_id=vm_uuid
            )
            raise exception.NoDataFound

        self.ceilometer_vm_data_cache[vm_uuid] = dict(
            cpu=total_cpu_utilization, ram=vm_ram_util, disk=vm_disk_util)
        return self.ceilometer_vm_data_cache.get(vm_uuid)
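
    # Worked example of the CPU arithmetic above: a VM with 4 vCPUs and an
    # average cpu_util of 50 over the period contributes
    #     cpu = 4 * (50 / 100.0) = 2.0
    # while ram and disk are taken directly from the respective meters.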

    def get_hypervisor_utilization(self, hypervisor, model, period=3600,
                                   aggr='avg'):
        """Collect cpu, ram and disk utilization statistics of a hypervisor.

        :param hypervisor: hypervisor object
        :param model: model_root object
        :param period: seconds
        :param aggr: string
        :return: dict(cpu(number of cores used), ram(MB used), disk(B used))
        """
        hypervisor_vms = model.get_mapping().get_node_vms_from_id(
            hypervisor.uuid)
        hypervisor_ram_util = 0
        hypervisor_disk_util = 0
        hypervisor_cpu_util = 0
        for vm_uuid in hypervisor_vms:
            vm_util = self.get_vm_utilization(vm_uuid, model, period, aggr)
            hypervisor_cpu_util += vm_util['cpu']
            hypervisor_ram_util += vm_util['ram']
            hypervisor_disk_util += vm_util['disk']

        return dict(cpu=hypervisor_cpu_util, ram=hypervisor_ram_util,
                    disk=hypervisor_disk_util)

    def get_hypervisor_capacity(self, hypervisor, model):
        """Collect cpu, ram and disk capacity of a hypervisor.

        :param hypervisor: hypervisor object
        :param model: model_root object
        :return: dict(cpu(cores), ram(MB), disk(B))
        """
        hypervisor_cpu_capacity = model.get_resource_from_id(
            resource.ResourceType.cpu_cores).get_capacity(hypervisor)

        hypervisor_disk_capacity = model.get_resource_from_id(
            resource.ResourceType.disk_capacity).get_capacity(hypervisor)

        hypervisor_ram_capacity = model.get_resource_from_id(
            resource.ResourceType.memory).get_capacity(hypervisor)
        return dict(cpu=hypervisor_cpu_capacity, ram=hypervisor_ram_capacity,
                    disk=hypervisor_disk_capacity)

    def get_relative_hypervisor_utilization(self, hypervisor, model):
        """Return relative hypervisor utilization (rhu).

        :param hypervisor: hypervisor object
        :param model: model_root object
        :return: {'cpu': <0,1>, 'ram': <0,1>, 'disk': <0,1>}
        """
        rhu = {}
        util = self.get_hypervisor_utilization(hypervisor, model)
        cap = self.get_hypervisor_capacity(hypervisor, model)
        for k in util.keys():
            rhu[k] = float(util[k]) / float(cap[k])
        return rhu
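
    # e.g. a hypervisor using 8 of 16 cores, 2048 of 4096 MB of RAM and a
    # quarter of its disk capacity yields
    #     rhu = {'cpu': 0.5, 'ram': 0.5, 'disk': 0.25}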

    def get_relative_cluster_utilization(self, model):
        """Calculate the relative cluster utilization (rcu).

        RCU is an average of the relative utilizations (rhu) of active
        hypervisors.

        :param model: model_root object
        :return: {'cpu': <0,1>, 'ram': <0,1>, 'disk': <0,1>}
        """
        hypervisors = model.get_all_hypervisors().values()
        rcu = {}
        counters = {}
        for hypervisor in hypervisors:
            hyper_state_str = self.get_state_str(hypervisor.state)
            if hyper_state_str == hyper_state.HypervisorState.ENABLED.value:
                rhu = self.get_relative_hypervisor_utilization(
                    hypervisor, model)
                for k in rhu.keys():
                    if k not in rcu:
                        rcu[k] = 0
                    if k not in counters:
                        counters[k] = 0
                    rcu[k] += rhu[k]
                    counters[k] += 1
        for k in rcu.keys():
            rcu[k] /= counters[k]
        return rcu

    def is_overloaded(self, hypervisor, model, cc):
        """Indicate whether a hypervisor is overloaded.

        This considers provided resource capacity coefficients (cc).

        :param hypervisor: hypervisor object
        :param model: model_root object
        :param cc: dictionary containing resource capacity coefficients
        :return: True or False
        """
        hypervisor_capacity = self.get_hypervisor_capacity(hypervisor, model)
        hypervisor_utilization = self.get_hypervisor_utilization(
            hypervisor, model)
        # Only CPU utilization is currently considered for overload.
        metrics = ['cpu']
        for m in metrics:
            if hypervisor_utilization[m] > hypervisor_capacity[m] * cc[m]:
                return True
        return False

    def vm_fits(self, vm_uuid, hypervisor, model, cc):
        """Indicate whether a hypervisor is able to accommodate a VM.

        This considers provided resource capacity coefficients (cc).

        :param vm_uuid: string
        :param hypervisor: hypervisor object
        :param model: model_root object
        :param cc: dictionary containing resource capacity coefficients
        :return: True or False
        """
        hypervisor_capacity = self.get_hypervisor_capacity(hypervisor, model)
        hypervisor_utilization = self.get_hypervisor_utilization(
            hypervisor, model)
        vm_utilization = self.get_vm_utilization(vm_uuid, model)
        metrics = ['cpu', 'ram', 'disk']
        for m in metrics:
            if (vm_utilization[m] + hypervisor_utilization[m] >
                    hypervisor_capacity[m] * cc[m]):
                return False
        return True

    def optimize_solution(self, model):
        """Optimize the solution.

        This is done by eliminating unnecessary or circular sets of
        migrations which can be replaced by a more efficient solution.
        e.g.:

        * A->B, B->C => replace migrations A->B, B->C with a single
          migration A->C, as both solutions result in the VM running on
          hypervisor C, which can be achieved with one migration
          instead of two.
        * A->B, B->A => remove A->B and B->A as they do not result
          in a new VM placement.

        :param model: model_root object
        """
        migrate_actions = (
            a for a in self.solution.actions if a[
                'action_type'] == 'migrate')
        vm_to_be_migrated = (a['input_parameters']['resource_id']
                             for a in migrate_actions)
        vm_uuids = list(set(vm_to_be_migrated))
        for vm_uuid in vm_uuids:
            actions = list(
                a for a in self.solution.actions if a[
                    'input_parameters'][
                        'resource_id'] == vm_uuid)
            if len(actions) > 1:
                # Keep only the endpoints of the migration chain: the
                # source of the first hop and the destination of the last.
                src = actions[0]['input_parameters']['src_hypervisor']
                dst = actions[-1]['input_parameters']['dst_hypervisor']
                for a in actions:
                    self.solution.actions.remove(a)
                    self.number_of_migrations -= 1
                if src != dst:
                    self.add_migration(vm_uuid, src, dst, model)

    def offload_phase(self, model, cc):
        """Perform the offloading phase.

        This considers provided resource capacity coefficients.
        The offload phase performs first-fit based bin packing to offload
        overloaded hypervisors. This is done by moving the least CPU
        utilized VMs first, as live migrating these generally causes less
        trouble. This phase results in a cluster with no overloaded
        hypervisors.
        * This phase is able to enable disabled hypervisors (if needed
          and any are available) in case the resource capacity provided
          by the active hypervisors cannot accommodate all the load.
          As the offload phase is later followed by the consolidation
          phase, enabling a hypervisor in this phase doesn't necessarily
          result in more enabled hypervisors in the final solution.

        :param model: model_root object
        :param cc: dictionary containing resource capacity coefficients
        """
        sorted_hypervisors = sorted(
            model.get_all_hypervisors().values(),
            key=lambda x: self.get_hypervisor_utilization(x, model)['cpu'])
        # Walk hypervisors from the most to the least CPU utilized.
        for hypervisor in reversed(sorted_hypervisors):
            if self.is_overloaded(hypervisor, model, cc):
                for vm in sorted(
                        model.get_mapping().get_node_vms(hypervisor),
                        key=lambda x: self.get_vm_utilization(
                            x, model)['cpu']
                ):
                    for dst_hypervisor in reversed(sorted_hypervisors):
                        if self.vm_fits(vm, dst_hypervisor, model, cc):
                            self.add_migration(vm, hypervisor,
                                               dst_hypervisor, model)
                            break
                    if not self.is_overloaded(hypervisor, model, cc):
                        break

    def consolidation_phase(self, model, cc):
        """Perform the consolidation phase.

        This considers provided resource capacity coefficients.
        The consolidation phase performs first-fit based bin packing.
        First, the hypervisors with the lowest cpu utilization are
        consolidated by moving their load to the hypervisors with the
        highest cpu utilization which can accommodate the load. In this
        phase the most cpu utilized VMs are prioritized, as their load
        is more difficult to accommodate in the system than that of less
        cpu utilized VMs, which can later be used to fill smaller CPU
        capacity gaps.

        :param model: model_root object
        :param cc: dictionary containing resource capacity coefficients
        """
        sorted_hypervisors = sorted(
            model.get_all_hypervisors().values(),
            key=lambda x: self.get_hypervisor_utilization(x, model)['cpu'])
        # asc walks source hypervisors from the least utilized end while
        # dsc walks destination candidates from the most utilized end;
        # stopping once asc >= dsc keeps a VM from being moved onto a
        # hypervisor that is itself still to be consolidated.
        asc = 0
        for hypervisor in sorted_hypervisors:
            vms = sorted(model.get_mapping().get_node_vms(hypervisor),
                         key=lambda x: self.get_vm_utilization(
                             x, model)['cpu'])
            for vm in reversed(vms):
                dsc = len(sorted_hypervisors) - 1
                for dst_hypervisor in reversed(sorted_hypervisors):
                    if asc >= dsc:
                        break
                    if self.vm_fits(vm, dst_hypervisor, model, cc):
                        self.add_migration(vm, hypervisor,
                                           dst_hypervisor, model)
                        break
                    dsc -= 1
            asc += 1

    def pre_execute(self):
        if not self.compute_model:
            raise exception.ClusterStateNotDefined()

    def do_execute(self):
        """Execute the strategy.

        This strategy produces a solution resulting in more
        efficient utilization of cluster resources using the following
        four phases:

        * Offload phase - handling over-utilized resources
        * Consolidation phase - handling under-utilized resources
        * Solution optimization - reducing the number of migrations
        * Disabling of unused hypervisors
        """
        LOG.info(_LI('Executing VM Workload Consolidation Strategy'))
        model = self.compute_model.get_latest_cluster_data_model()
        rcu = self.get_relative_cluster_utilization(model)
        self.ceilometer_vm_data_cache = dict()

        cc = {'cpu': 1.0, 'ram': 1.0, 'disk': 1.0}

        # Offloading phase
        self.offload_phase(model, cc)

        # Consolidation phase
        self.consolidation_phase(model, cc)

        # Optimize solution
        self.optimize_solution(model)

        # Disable unused hypervisors
        self.disable_unused_hypervisors(model)

        rcu_after = self.get_relative_cluster_utilization(model)
        info = {
            'number_of_migrations': self.number_of_migrations,
            'number_of_released_hypervisors':
                self.number_of_released_hypervisors,
            'relative_cluster_utilization_before': str(rcu),
            'relative_cluster_utilization_after': str(rcu_after)
        }

        LOG.debug(info)

    def post_execute(self):
        self.solution.set_efficacy_indicators(
            vm_migrations_count=self.number_of_migrations,
            released_compute_nodes_count=self.number_of_released_hypervisors,
        )
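
# A rough sketch of how this strategy is driven. It assumes the base
# strategy's execute() template calls the three hooks below in order,
# and the config object is hypothetical:
#
#     strategy = VMWorkloadConsolidation(config)
#     strategy.pre_execute()
#     strategy.do_execute()
#     strategy.post_execute()
#     # strategy.solution now holds the migrate and
#     # change_nova_service_state actions plus the efficacy indicators.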