From fb85b27ae3c60f414bd6fba59a1de88a61050b24 Mon Sep 17 00:00:00 2001
From: Alfredo Moralejo <amoralej@redhat.com>
Date: Tue, 10 Jun 2025 11:25:31 +0200
Subject: [PATCH] Use KiB as unit for host_ram_usage when using prometheus
 datasource

The prometheus datasource was reporting host_ram_usage in MiB as
described in the docstring for the base datasource interface
definition [1].

However, the gnocchi datasource is reporting it in KiB following
ceilometer metric `hardware.memory.used` [2] and the strategies
using that metric expect it to be in KiB so the best approach is
to change the unit in the prometheus datasource and update the
docstring to avoid missunderstandings in future. So, this patch
is fixing the prometheus datasource to return host_ram_usage
in KiB instead of MiB.

Additionally, it is adding more unit tests for the check_threshold
method so that it covers the memory based strategy execution, validates
the calculated standard deviation and adds the cases where it is below
the threshold.

[1] https://github.com/openstack/watcher/blob/15981117ee28627f235264e505e1e0d5956cf4e4/watcher/decision_engine/datasources/base.py#L177-L183
[2] https://docs.openstack.org/ceilometer/train/admin/telemetry-measurements.html#snmp-based-meters

Closes-Bug: #2113776
Change-Id: Idc060d1e709c0265c64ada16062c3a206c6b04fa
(cherry picked from commit 6ea362da0ba49dc58ded46647fc9a4d1befe7de0)
---
 .../notes/bug-2113776-4bd314fb46623fbc.yaml   | 14 +++++
 watcher/decision_engine/datasources/base.py   |  2 +-
 .../decision_engine/datasources/prometheus.py |  5 +-
 .../datasources/test_prometheus_helper.py     |  6 +-
 .../strategies/test_workload_stabilization.py | 62 ++++++++++++++++++-
 5 files changed, 82 insertions(+), 7 deletions(-)
 create mode 100644 releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml

diff --git a/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
new file mode 100644
index 000000000..251b03502
--- /dev/null
+++ b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml
@@ -0,0 +1,14 @@
+---
+fixes:
+  - |
+    When running an audit with the `workload_stabilization` strategy with
+    `instance_ram_usage` metric in a deployment with prometheus datasource,
+    the host metric for the ram usage was wrongly reported with the incorrect
+    unit which lead to incorrect standard deviation and action plans due to the
+    application of the wrong scale factor in the algorithm.
+
+    The host ram usage metric is now properly reported in KB when using a
+    prometheus datasource and the strategy `workload_stabilization` calculates
+    the standard deviation properly.
+
+    For more details: https://launchpad.net/bugs/2113776
diff --git a/watcher/decision_engine/datasources/base.py b/watcher/decision_engine/datasources/base.py
index 63f03bcff..c196bece1 100644
--- a/watcher/decision_engine/datasources/base.py
+++ b/watcher/decision_engine/datasources/base.py
@@ -178,7 +178,7 @@ class DataSourceBase(object):
                            granularity=None):
         """Get the ram usage for a host such as a compute_node
 
-        :return: ram usage as float in megabytes
+        :return: ram usage as float in kibibytes
         """
         pass
 
diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py
index 7b7572f3b..0f3e1a914 100644
--- a/watcher/decision_engine/datasources/prometheus.py
+++ b/watcher/decision_engine/datasources/prometheus.py
@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
         (node_memory_MemTotal_bytes{instance='the_host'} -
         avg_over_time(
             node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
-            / 1024 / 1024
+            / 1024
 
         So we take total and subtract available memory to determine
         how much is in use. We use the prometheus xxx_over_time functions
@@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase):
                    'meter': meter, 'period': period}
             )
         elif meter == 'node_memory_MemAvailable_bytes':
+            # Prometheus metric is in B and we need to return KB
             query_args = (
                 "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
                 "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
-                "[%(period)ss])) / 1024 / 1024"
+                "[%(period)ss])) / 1024"
                 % {'label': self.prometheus_fqdn_label,
                    'label_value': instance_label, 'agg': aggregate,
                    'meter': meter, 'period': period}
diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
index 5aaccb0ec..3e502729f 100644
--- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
@@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase):
         expected_query = (
             "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
             "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
         result = self.helper._build_prometheus_query(
             'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
         self.assertEqual(result, expected_query)
@@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase):
         expected_query = (
             "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
             "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
         result = self.helper._build_prometheus_query(
             'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
         self.assertEqual(result, expected_query)
@@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase):
         expected_query = (
             "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
             "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
-            "/ 1024 / 1024")
+            "/ 1024")
         result = self.helper._build_prometheus_query(
             'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
         self.assertEqual(result, expected_query)
diff --git a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
index 8a442755b..90721c19a 100644
--- a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
+++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py
@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
             len(self.strategy.simulate_migrations(self.hosts_load_assert)))
 
     def test_check_threshold(self):
+
+        # sd for 0.05, 0.05, 0.07, 0.07, 0.8
+        test_cpu_sd = 0.296
+
         self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
-        self.strategy.thresholds = {'instance_cpu_usage': 0.001,
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
                                     'instance_ram_usage': 0.2}
+
         self.strategy.simulate_migrations = mock.Mock(return_value=True)
         self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_cpu(self):
+
+        test_cpu_sd = 0.296
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_cpu_sd)
+
+    def test_check_threshold_ram(self):
+
+        # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts
+        test_ram_sd = 0.071
+
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.05}
+
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertTrue(self.strategy.check_threshold())
+        self.assertEqual(
+            round(self.strategy.sd_before_audit, 3),
+            test_ram_sd)
+
+    def test_check_threshold_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.thresholds = {'instance_cpu_usage': 0.3,
+                                    'instance_ram_usage': 0.2}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_cpu_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_cpu_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.4}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
+
+    def test_check_threshold_ram_fail(self):
+        self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
+        self.strategy.metrics = ["instance_ram_usage"]
+        self.strategy.thresholds = {'instance_cpu_usage': 0.25,
+                                    'instance_ram_usage': 0.1}
+        self.strategy.simulate_migrations = mock.Mock(return_value=True)
+        self.assertFalse(self.strategy.check_threshold())
 
     def test_execute_one_migration(self):
         self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()