Use KiB as unit for host_ram_usage when using prometheus datasource

The prometheus datasource was reporting host_ram_usage in MiB as described in the docstring for the base datasource interface definition [1]. However, the gnocchi datasource is reporting it in KiB following ceilometer metric `hardware.memory.used` [2] and the strategies using that metric expect it to be in KiB so the best approach is to change the unit in the prometheus datasource and update the docstring to avoid missunderstandings in future. So, this patch is fixing the prometheus datasource to return host_ram_usage in KiB instead of MiB. Additionally, it is adding more unit tests for the check_threshold method so that it covers the memory based strategy execution, validates the calculated standard deviation and adds the cases where it is below the threshold. [1] 15981117ee/watcher/decision_engine/datasources/base.py (L177-L183) [2] https://docs.openstack.org/ceilometer/train/admin/telemetry-measurements.html#snmp-based-meters Closes-Bug: #2113776 Change-Id: Idc060d1e709c0265c64ada16062c3a206c6b04fa (cherry picked from commit 6ea362da0b)
2025-06-10 11:25:31 +02:00
parent f4bfb10525
commit fb85b27ae3
5 changed files with 82 additions and 7 deletions
--- a/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
+++ b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
@@ -0,0 +1,14 @@
+---
+fixes:
+  - |
+    When running an audit with the `workload_stabilization` strategy with
+    `instance_ram_usage` metric in a deployment with prometheus datasource,
+    the host metric for the ram usage was wrongly reported with the incorrect
+    unit which lead to incorrect standard deviation and action plans due to the
+    application of the wrong scale factor in the algorithm.
+
+    The host ram usage metric is now properly reported in KB when using a
+    prometheus datasource and the strategy `workload_stabilization` calculates
+    the standard deviation properly.
+
+    For more details: https://launchpad.net/bugs/2113776
--- a/watcher/decision_engine/datasources/base.py
+++ b/watcher/decision_engine/datasources/base.py
@@ -178,7 +178,7 @@ class DataSourceBase(object):
                           granularity=None):
        """Get the ram usage for a host such as a compute_node

-        :return: ram usage as float in megabytes
+        :return: ram usage as float in kibibytes
        """
        pass

--- a/watcher/decision_engine/datasources/prometheus.py
+++ b/watcher/decision_engine/datasources/prometheus.py
@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
        (node_memory_MemTotal_bytes{instance='the_host'} -
        avg_over_time(
            node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
-            / 1024 / 1024
+            / 1024

        So we take total and subtract available memory to determine
        how much is in use. We use the prometheus xxx_over_time functions
@@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase):
                   'meter': meter, 'period': period}
            )
        elif meter == 'node_memory_MemAvailable_bytes':
+            # Prometheus metric is in B and we need to return KB
            query_args = (
                "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
                "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
-                "[%(period)ss])) / 1024 / 1024"
+                "[%(period)ss])) / 1024"
                % {'label': self.prometheus_fqdn_label,
                   'label_value': instance_label, 'agg': aggregate,
                   'meter': meter, 'period': period}
--- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
@@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
            "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
        self.assertEqual(result, expected_query)
@@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
            "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
        self.assertEqual(result, expected_query)
@@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
            "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
        self.assertEqual(result, expected_query)
--- a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
+++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
            len(self.strategy.simulate_migrations(self.hosts_load_assert)))

    def test_check_threshold(self):
+
+        # sd for 0.05, 0.05, 0.07, 0.07, 0.8
+        test_cpu_sd = 0.296
+
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
-        self.strategy.thresholds = {'instance_cpu_usage': 0.001,
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
                                    'instance_ram_usage': 0.2}
+
        self.strategy.simulate_migrations = mock.Mock(return_value=True)
        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_cpu(self):
+
+        test_cpu_sd = 0.296
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_ram(self):
+
+        # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts
+        test_ram_sd = 0.071
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.05}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_ram_sd)
+
+    def test_check_threshold_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.thresholds = {'instance_cpu_usage': 0.3,
+                                    'instance_ram_usage': 0.2}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_cpu_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.4}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_ram_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.1}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())

    def test_execute_one_migration(self):
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()