outlet Temperature based migration strategy

It implements one of the algorithms of the Intel thermal PoC.
It extends the BaseStrategy class, gets the outlet temperature
metrics of servers via Ceilometer, and generates solutions
when the outlet temperature is greater than the threshold.
The current threshold is hard-coded; it will be made configurable
in the next patches.

Implements: blueprint outlet-temperature-based-strategy

Change-Id: I248147329d34eddf408652205a077895be572010
Co-Authored-By: Zhenzan Zhou <zhenzan.zhou@intel.com>
This commit is contained in:
junjie huang
2015-12-14 00:09:57 +00:00
parent 660c782626
commit 8ebc898924
8 changed files with 440 additions and 27 deletions

View File

@@ -0,0 +1,251 @@
# -*- encoding: utf-8 -*-
# Copyright (c) 2015 Intel Corp
#
# Authors: Junjie-Huang <junjie.huang@intel.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from oslo_log import log
from watcher._i18n import _LE
from watcher.common import exception as wexc
from watcher.decision_engine.actions.migration import Migrate
from watcher.decision_engine.actions.migration import MigrationType
from watcher.decision_engine.model.resource import ResourceType
from watcher.decision_engine.model.vm_state import VMState
from watcher.decision_engine.strategy.common.level import StrategyLevel
from watcher.decision_engine.strategy.strategies.base import BaseStrategy
from watcher.metrics_engine.cluster_history.ceilometer import \
CeilometerClusterHistory
LOG = log.getLogger(__name__)
class OutletTempControl(BaseStrategy):
    """[PoC] Outlet temperature control using live migration.

    It is a migration strategy based on the outlet temperature of physical
    servers. It generates solutions to move a workload whenever a server's
    outlet temperature is higher than the specified threshold. As of now,
    we cannot forecast how many instances should be migrated. This is the
    reason why we simply plan a single virtual machine migration.
    So it's better to use this algorithm with CONTINUOUS audits.

    Requirements:
    * Hardware: compute node should support IPMI and PTAS technology
    * Software: Ceilometer component ceilometer-agent-ipmi running
      in each compute node, and Ceilometer API can report such telemetry
      "hardware.ipmi.node.outlet_temperature" successfully.
    * You must have at least 2 physical compute nodes to run this strategy.

    Good Strategy:
    Towards software defined infrastructure, power and thermal
    intelligence is being adopted to optimize workload, which can help
    improve efficiency, reduce power, as well as improve datacenter PUE
    and lower the operation cost in the data center.
    Outlet (Exhaust Air) Temperature is one of the important thermal
    telemetries to measure the thermal/workload status of a server.
    """

    DEFAULT_NAME = "outlet_temp_control"
    DEFAULT_DESCRIPTION = "outlet temperature based migration strategy"
    # The meter to report outlet temperature in ceilometer
    METER_NAME = "hardware.ipmi.node.outlet_temperature"
    # Unit: degree C
    THRESHOLD = 35.0

    def __init__(self, name=DEFAULT_NAME, description=DEFAULT_DESCRIPTION):
        """Initialize the strategy with the default threshold and meter.

        :param name: the name of the strategy
        :param description: a description of the strategy
        """
        super(OutletTempControl, self).__init__(name, description)
        # the migration plan will be triggered when the outlet temperature
        # reaches threshold
        # TODO(zhenzanz): Threshold should be configurable for each audit
        self.threshold = self.THRESHOLD
        self._meter = self.METER_NAME
        # Created lazily by the ``ceilometer`` property on first access.
        self._ceilometer = None

    @property
    def ceilometer(self):
        """Return the metrics backend, creating it on first access."""
        if self._ceilometer is None:
            self._ceilometer = CeilometerClusterHistory()
        return self._ceilometer

    @ceilometer.setter
    def ceilometer(self, c):
        """Replace the metrics backend with ``c``."""
        self._ceilometer = c

    def calc_used_res(self, model, hypervisor, cap_cores, cap_mem, cap_disk):
        """Calculate the used vcpus, memory and disk based on VM flavors.

        :param model: cluster model used to look up the VMs of a hypervisor
        :param hypervisor: the hypervisor whose usage is computed
        :param cap_cores: resource object giving the cores of a VM
        :param cap_mem: resource object giving the memory of a VM
        :param cap_disk: resource object giving the disk of a VM
        :return: tuple ``(vcpus_used, memory_mb_used, disk_gb_used)``, each
                 the sum of the per-VM capacities; all zero when the
                 hypervisor hosts no VM
        """
        vcpus_used = 0
        memory_mb_used = 0
        disk_gb_used = 0
        for vm_id in model.get_mapping().get_node_vms(hypervisor):
            vm = model.get_vm_from_id(vm_id)
            vcpus_used += cap_cores.get_capacity(vm)
            memory_mb_used += cap_mem.get_capacity(vm)
            disk_gb_used += cap_disk.get_capacity(vm)
        return vcpus_used, memory_mb_used, disk_gb_used

    def group_hosts_by_outlet_temp(self, model):
        """Group hosts based on outlet temp meters.

        Hosts for which no outlet temperature is reported are logged and
        left out of both groups.

        :param model: cluster model providing the hypervisors
        :return: tuple ``(hosts_need_release, hosts_target)``; each item is
                 a list of dicts with keys ``'hv'`` (the hypervisor) and
                 ``'outlet_temp'`` (the aggregated meter value). Hosts at or
                 above the threshold go into the first list, the others
                 into the second.
        :raises: wexc.ClusterEmpty if the model contains no hypervisor
        """
        hypervisors = model.get_all_hypervisors()
        if len(hypervisors) == 0:
            raise wexc.ClusterEmpty()

        hosts_need_release = []
        hosts_target = []
        for hypervisor_id in hypervisors:
            hypervisor = model.get_hypervisor_from_id(hypervisor_id)
            resource_id = hypervisor.uuid
            outlet_temp = self.ceilometer.statistic_aggregation(
                resource_id=resource_id,
                meter_name=self._meter,
                period="30",
                aggregate='avg')
            # some hosts may not have outlet temp meters, remove from target
            if outlet_temp is None:
                LOG.warning(_LE("%s: no outlet temp data"), resource_id)
                continue

            # Lazy logger arguments: only formatted if debug is enabled.
            LOG.debug("%s: outlet temperature %f", resource_id, outlet_temp)
            hvmap = {'hv': hypervisor, 'outlet_temp': outlet_temp}
            if outlet_temp >= self.threshold:
                # mark the hypervisor to release resources
                hosts_need_release.append(hvmap)
            else:
                hosts_target.append(hvmap)
        return hosts_need_release, hosts_target

    def choose_vm_to_migrate(self, model, hosts):
        """Pick up an active vm instance to migrate from provided hosts.

        Hosts are examined in the given order and the first active VM found
        is selected.

        :param model: cluster model used to resolve VMs
        :param hosts: list of dicts with a ``'hv'`` key, as produced by
                      ``group_hosts_by_outlet_temp``
        :return: tuple ``(source_hypervisor, vm)``, or None when no active
                 VM is found on any of the hosts
        """
        for hvmap in hosts:
            mig_src_hypervisor = hvmap['hv']
            vms_of_src = model.get_mapping().get_node_vms(mig_src_hypervisor)
            for vm_id in vms_of_src:
                try:
                    # select the first active VM to migrate
                    vm = model.get_vm_from_id(vm_id)
                    if vm.state != VMState.ACTIVE.value:
                        LOG.info(_LE("VM not active, skipped: %s"),
                                 vm.uuid)
                        continue
                    return (mig_src_hypervisor, vm)
                except wexc.VMNotFound as e:
                    # Log the exception itself: ``e.message`` is not a
                    # standard exception attribute on Python 3.
                    LOG.info("VM not found Error: %s", e)
        return None

    def filter_dest_servers(self, model, hosts, vm_to_migrate):
        """Only return hosts with sufficient available resources.

        :param model: cluster model providing resources and mappings
        :param hosts: candidate destinations, a list of dicts with a
                      ``'hv'`` key
        :param vm_to_migrate: the VM that must fit on the destination
        :return: the sub-list of ``hosts`` (same dict objects, same order)
                 whose free cores, disk and memory each cover what the VM
                 requires
        """
        cap_cores = model.get_resource_from_id(ResourceType.cpu_cores)
        cap_disk = model.get_resource_from_id(ResourceType.disk)
        cap_mem = model.get_resource_from_id(ResourceType.memory)

        required_cores = cap_cores.get_capacity(vm_to_migrate)
        required_disk = cap_disk.get_capacity(vm_to_migrate)
        required_mem = cap_mem.get_capacity(vm_to_migrate)

        # filter hypervisors without enough resource
        dest_servers = []
        for hvmap in hosts:
            host = hvmap['hv']
            cores_used, mem_used, disk_used = self.calc_used_res(
                model, host, cap_cores, cap_mem, cap_disk)
            # Each capacity is reduced by the usage of the *same* resource
            # (disk by disk usage, memory by memory usage).
            cores_available = cap_cores.get_capacity(host) - cores_used
            disk_available = cap_disk.get_capacity(host) - disk_used
            mem_available = cap_mem.get_capacity(host) - mem_used
            if (cores_available >= required_cores and
                    disk_available >= required_disk and
                    mem_available >= required_mem):
                dest_servers.append(hvmap)
        return dest_servers

    def execute(self, orign_model):
        """Plan at most one migration away from the hottest host.

        The first active VM found on the hosts at or above the threshold
        (hottest host first) is moved to the coolest host below the
        threshold that has enough free resources.

        :param orign_model: the cluster model to optimize
        :return: ``self.solution`` (not defined in this class; presumably
                 provided by BaseStrategy), holding one migration change
                 request when a migration could be planned
        :raises: wexc.ClusterStateNotDefined if ``orign_model`` is None
        """
        LOG.debug("Initializing Outlet temperature strategy")

        if orign_model is None:
            raise wexc.ClusterStateNotDefined()

        current_model = orign_model
        hosts_need_release, hosts_target = self.group_hosts_by_outlet_temp(
            current_model)

        if not hosts_need_release:
            # TODO(zhenzanz): return something right if there's no hot servers
            LOG.debug("No hosts require optimization")
            return self.solution

        if not hosts_target:
            LOG.warning(_LE("No hosts under outlet temp threshold found"))
            return self.solution

        # choose the server with highest outlet t
        hosts_need_release = sorted(hosts_need_release,
                                    reverse=True,
                                    key=lambda x: (x["outlet_temp"]))

        vm_to_migrate = self.choose_vm_to_migrate(current_model,
                                                  hosts_need_release)
        if vm_to_migrate is None:
            return self.solution

        mig_src_hypervisor, vm_src = vm_to_migrate
        # keep only the targets able to hold the vm's cpu cores, memory
        # and disk needs
        dest_servers = self.filter_dest_servers(current_model,
                                                hosts_target,
                                                vm_src)
        if not dest_servers:
            # TODO(zhenzanz): maybe to warn that there's no resource
            # for instance.
            LOG.info(_LE("No proper target host could be found"))
            return self.solution

        # always use the host with the lowest outlet temperature; min()
        # returns the first minimum, like a stable ascending sort would
        mig_dst_hypervisor = min(dest_servers,
                                 key=lambda x: (x["outlet_temp"]))['hv']

        # generate solution to migrate the vm to the dest server,
        if current_model.get_mapping().migrate_vm(vm_src,
                                                  mig_src_hypervisor,
                                                  mig_dst_hypervisor):
            live_migrate = Migrate(vm_src,
                                   mig_src_hypervisor,
                                   mig_dst_hypervisor)
            live_migrate.migration_type = MigrationType.pre_copy
            live_migrate.level = StrategyLevel.conservative
            self.solution.add_change_request(live_migrate)

        self.solution.model = current_model

        return self.solution