Merge "Use KiB as unit for host_ram_usage when using prometheus datasource" into stable/2025.1

2025-06-27 16:59:14 +00:00
parent c0ebb8ddb3 fb85b27ae3
commit 3a923dbf16
5 changed files with 82 additions and 7 deletions
--- a/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
+++ b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
@@ -0,0 +1,14 @@
+---
+fixes:
+  - |
+    When running an audit with the `workload_stabilization` strategy with
+    `instance_ram_usage` metric in a deployment with prometheus datasource,
+    the host metric for the ram usage was wrongly reported with the incorrect
+    unit which lead to incorrect standard deviation and action plans due to the
+    application of the wrong scale factor in the algorithm.
+
+    The host ram usage metric is now properly reported in KB when using a
+    prometheus datasource and the strategy `workload_stabilization` calculates
+    the standard deviation properly.
+
+    For more details: https://launchpad.net/bugs/2113776
--- a/watcher/decision_engine/datasources/base.py
+++ b/watcher/decision_engine/datasources/base.py
@@ -178,7 +178,7 @@ class DataSourceBase(object):
                           granularity=None):
        """Get the ram usage for a host such as a compute_node

-        :return: ram usage as float in megabytes
+        :return: ram usage as float in kibibytes
        """
        pass

--- a/watcher/decision_engine/datasources/prometheus.py
+++ b/watcher/decision_engine/datasources/prometheus.py
@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
        (node_memory_MemTotal_bytes{instance='the_host'} -
        avg_over_time(
            node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
-            / 1024 / 1024
+            / 1024

        So we take total and subtract available memory to determine
        how much is in use. We use the prometheus xxx_over_time functions
@@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase):
                   'meter': meter, 'period': period}
            )
        elif meter == 'node_memory_MemAvailable_bytes':
+            # Prometheus metric is in B and we need to return KB
            query_args = (
                "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
                "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
-                "[%(period)ss])) / 1024 / 1024"
+                "[%(period)ss])) / 1024"
                % {'label': self.prometheus_fqdn_label,
                   'label_value': instance_label, 'agg': aggregate,
                   'meter': meter, 'period': period}
--- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
@@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
            "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
        self.assertEqual(result, expected_query)
@@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
            "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
        self.assertEqual(result, expected_query)
@@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase):
        expected_query = (
            "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
            "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
        result = self.helper._build_prometheus_query(
            'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
        self.assertEqual(result, expected_query)
--- a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
+++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
            len(self.strategy.simulate_migrations(self.hosts_load_assert)))

    def test_check_threshold(self):
+
+        # sd for 0.05, 0.05, 0.07, 0.07, 0.8
+        test_cpu_sd = 0.296
+
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
-        self.strategy.thresholds = {'instance_cpu_usage': 0.001,
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
                                    'instance_ram_usage': 0.2}
+
        self.strategy.simulate_migrations = mock.Mock(return_value=True)
        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_cpu(self):
+
+        test_cpu_sd = 0.296
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_ram(self):
+
+        # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts
+        test_ram_sd = 0.071
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.05}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_ram_sd)
+
+    def test_check_threshold_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.thresholds = {'instance_cpu_usage': 0.3,
+                                    'instance_ram_usage': 0.2}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_cpu_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.4}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_ram_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.1}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())

    def test_execute_one_migration(self):
        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()