watcher/watcher/decision_engine/datasources/gnocchi.py

# -*- encoding: utf-8 -*-
# Copyright (c) 2017 Servionica
#
# Authors: Alexander Chadin <a.chadin@servionica.ru>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime
from datetime import timedelta

from gnocchiclient import exceptions as gnc_exc
from oslo_config import cfg
from oslo_log import log

from watcher.common import clients
from watcher.decision_engine.datasources import base

CONF = cfg.CONF
LOG = log.getLogger(__name__)


class GnocchiHelper(base.DataSourceBase):

    NAME = 'gnocchi'
    METRIC_MAP = dict(host_cpu_usage='compute.node.cpu.percent',
                      host_ram_usage='hardware.memory.used',
                      host_outlet_temp='hardware.ipmi.node.outlet_temperature',
                      host_inlet_temp='hardware.ipmi.node.temperature',
                      host_airflow='hardware.ipmi.node.airflow',
                      host_power='hardware.ipmi.node.power',
                      instance_cpu_usage='cpu',
                      instance_ram_usage='memory.resident',
                      instance_ram_allocated='memory',
                      instance_l3_cache_usage='cpu_l3_cache',
                      instance_root_disk_size='disk.root.size',
                      )

    def __init__(self, osc=None):
        """:param osc: an OpenStackClients instance"""
        self.osc = osc if osc else clients.OpenStackClients()
        self.gnocchi = self.osc.gnocchi()

    def check_availability(self):
        status = self.query_retry(self.gnocchi.status.get)
        if status:
            return 'available'
        else:
            return 'not available'

    def list_metrics(self):
        """List the user's meters."""
        response = self.query_retry(f=self.gnocchi.metric.list)
        if not response:
            return set()
        else:
            return set([metric['name'] for metric in response])

    def statistic_aggregation(self, resource=None, resource_type=None,
                              meter_name=None, period=300, aggregate='mean',
                              granularity=300):
        stop_time = datetime.utcnow()
        start_time = stop_time - timedelta(seconds=(int(period)))

        meter = self._get_meter(meter_name)

        if aggregate == 'count':
            aggregate = 'mean'
            LOG.warning('aggregate type count not supported by gnocchi,'
                        ' replaced with mean.')

        resource_id = resource.uuid
        if resource_type == 'compute_node':
            resource_id = "%s_%s" % (resource.hostname, resource.hostname)
            kwargs = dict(query={"=": {"original_resource_id": resource_id}},
                          limit=1)
            resources = self.query_retry(
                f=self.gnocchi.resource.search,
                ignored_exc=gnc_exc.NotFound,
                **kwargs)

            if not resources:
                LOG.warning("The {0} resource {1} could not be "
                            "found".format(self.NAME, resource_id))
                return

            resource_id = resources[0]['id']

        if meter_name == "instance_cpu_usage":
            if resource_type != "instance":
                LOG.warning("Unsupported resource type for metric "
                            "'instance_cpu_usage': ", resource_type)
                return

            # The "cpu_util" gauge (percentage) metric has been removed.
            # We're going to obtain the same result by using the rate of change
            # aggregate operation.
            if aggregate not in ("mean", "rate:mean"):
                LOG.warning("Unsupported aggregate for instance_cpu_usage "
                            "metric: %s. "
                            "Supported aggregates: mean, rate:mean ",
                            aggregate)
                return

            # TODO(lpetrut): consider supporting other aggregates.
            aggregate = "rate:mean"

        raw_kwargs = dict(
            metric=meter,
            start=start_time,
            stop=stop_time,
            resource_id=resource_id,
            granularity=granularity,
            aggregation=aggregate,
        )

        kwargs = {k: v for k, v in raw_kwargs.items() if k and v}

        statistics = self.query_retry(
            f=self.gnocchi.metric.get_measures,
            ignored_exc=gnc_exc.NotFound,
            **kwargs)

        return_value = None
        if statistics:
            # return value of latest measure
            # measure has structure [time, granularity, value]
            return_value = statistics[-1][2]

            if meter_name == 'host_airflow':
                # Airflow from hardware.ipmi.node.airflow is reported as
                # 1/10 th of actual CFM
                return_value *= 10
            if meter_name == "instance_cpu_usage":
                # "rate:mean" can return negative values for migrated vms.
                return_value = max(0, return_value)

                # We're converting the cumulative cpu time (ns) to cpu usage
                # percentage.
                vcpus = resource.vcpus
                if not vcpus:
                    LOG.warning("instance vcpu count not set, assuming 1")
                    vcpus = 1
                return_value *= 100 / (granularity * 10e+8) / vcpus

        return return_value

    def statistic_series(self, resource=None, resource_type=None,
                         meter_name=None, start_time=None, end_time=None,
                         granularity=300):

        meter = self._get_meter(meter_name)

        resource_id = resource.uuid
        if resource_type == 'compute_node':
            resource_id = "%s_%s" % (resource.hostname, resource.hostname)
            kwargs = dict(query={"=": {"original_resource_id": resource_id}},
                          limit=1)
            resources = self.query_retry(
                f=self.gnocchi.resource.search,
                ignored_exc=gnc_exc.NotFound,
                **kwargs)

            if not resources:
                LOG.warning("The {0} resource {1} could not be "
                            "found".format(self.NAME, resource_id))
                return

            resource_id = resources[0]['id']

        raw_kwargs = dict(
            metric=meter,
            start=start_time,
            stop=end_time,
            resource_id=resource_id,
            granularity=granularity,
        )

        kwargs = {k: v for k, v in raw_kwargs.items() if k and v}

        statistics = self.query_retry(
            f=self.gnocchi.metric.get_measures,
            ignored_exc=gnc_exc.NotFound,
            **kwargs)

        return_value = None
        if statistics:
            # measure has structure [time, granularity, value]
            if meter_name == 'host_airflow':
                # Airflow from hardware.ipmi.node.airflow is reported as
                # 1/10 th of actual CFM
                return_value = {s[0]: s[2]*10 for s in statistics}
            else:
                return_value = {s[0]: s[2] for s in statistics}

        return return_value

    def get_host_cpu_usage(self, resource, period, aggregate,
                           granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_cpu_usage', period,
            aggregate, granularity)

    def get_host_ram_usage(self, resource, period, aggregate,
                           granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_ram_usage', period,
            aggregate, granularity)

    def get_host_outlet_temp(self, resource, period, aggregate,
                             granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_outlet_temp', period,
            aggregate, granularity)

    def get_host_inlet_temp(self, resource, period, aggregate,
                            granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_inlet_temp', period,
            aggregate, granularity)

    def get_host_airflow(self, resource, period, aggregate,
                         granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_airflow', period,
            aggregate, granularity)

    def get_host_power(self, resource, period, aggregate,
                       granularity=300):

        return self.statistic_aggregation(
            resource, 'compute_node', 'host_power', period,
            aggregate, granularity)

    def get_instance_cpu_usage(self, resource, period, aggregate,
                               granularity=300):

        return self.statistic_aggregation(
            resource, 'instance', 'instance_cpu_usage', period,
            aggregate, granularity)

    def get_instance_ram_usage(self, resource, period, aggregate,
                               granularity=300):

        return self.statistic_aggregation(
            resource, 'instance', 'instance_ram_usage', period,
            aggregate, granularity)

    def get_instance_ram_allocated(self, resource, period, aggregate,
                                   granularity=300):

        return self.statistic_aggregation(
            resource, 'instance', 'instance_ram_allocated', period,
            aggregate, granularity)

    def get_instance_l3_cache_usage(self, resource, period, aggregate,
                                    granularity=300):

        return self.statistic_aggregation(
            resource, 'instance', 'instance_l3_cache_usage', period,
            aggregate, granularity)

    def get_instance_root_disk_size(self, resource, period, aggregate,
                                    granularity=300):

        return self.statistic_aggregation(
            resource, 'instance', 'instance_root_disk_size', period,
            aggregate, granularity)