From 1529e3faddf27ee4e0bdab8fd932f7e9f2aedeaa Mon Sep 17 00:00:00 2001 From: Alfredo Moralejo Date: Tue, 3 Jun 2025 11:59:14 +0200 Subject: [PATCH] Add debug message to report calculated metric for workload_balance The workload_balance strategy calculates host metrics based on the instance metrics and those are the ones used to compare with the threshold. Currently, the strategy does not report the calculated values, which makes it difficult to troubleshoot sometimes. This patch is adding a debug message to log those values. This patch is also adding a new unit test for filter_destination_hosts based on ram instead of cpu and adding assertions for the new debug messages. To implement the new test properly, I had to slightly modify the ram usage fixtures used for the workload_balance tests. Change-Id: Ief5e167afcf346ff53471f26adc70795c4b69f68 --- .../strategy/strategies/workload_balance.py | 38 +++++++++++++---- .../decision_engine/model/gnocchi_metrics.py | 4 +- .../strategies/test_workload_balance.py | 42 ++++++++++++++++++- 3 files changed, 71 insertions(+), 13 deletions(-) diff --git a/watcher/decision_engine/strategy/strategies/workload_balance.py b/watcher/decision_engine/strategy/strategies/workload_balance.py index 2bc51f3f5..880fa4ba1 100644 --- a/watcher/decision_engine/strategy/strategies/workload_balance.py +++ b/watcher/decision_engine/strategy/strategies/workload_balance.py @@ -192,15 +192,30 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): if (free_res['vcpu'] >= required_cores and free_res['memory'] >= required_mem and free_res['disk'] >= required_disk): - if (self._meter == 'instance_cpu_usage' and - ((src_instance_workload + workload) < - self.threshold / 100 * host.vcpus)): - destination_hosts.append(instance_data) - if (self._meter == 'instance_ram_usage' and - ((src_instance_workload + workload) < - self.threshold / 100 * host.memory)): - destination_hosts.append(instance_data) - + if self._meter == 'instance_cpu_usage': + usage = 
src_instance_workload + workload + usage_percent = usage / host.vcpus * 100 + limit = self.threshold / 100 * host.vcpus + if usage < limit: + destination_hosts.append(instance_data) + LOG.debug(f"Host {host.hostname} evaluated as destination " + f"for {instance_to_migrate.uuid}. Host usage " + f"for cpu would be {usage_percent}." + f"The threshold is: {self.threshold}. " + f"selected: {usage < limit}" + ) + if self._meter == 'instance_ram_usage': + usage = src_instance_workload + workload + usage_percent = usage / host.memory * 100 + limit = self.threshold / 100 * host.memory + if usage < limit: + destination_hosts.append(instance_data) + LOG.debug(f"Host {host.hostname} evaluated as destination " + f"for {instance_to_migrate.uuid}. Host usage " + f"for ram would be {usage_percent}." + f"The threshold is: {self.threshold}. " + f"selected: {usage < limit}" + ) return destination_hosts def group_hosts_by_cpu_or_ram_util(self): @@ -251,8 +266,10 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): cluster_workload += node_workload if self._meter == 'instance_cpu_usage': node_util = node_workload / node.vcpus * 100 + host_metric = 'host_cpu_usage_percent' else: node_util = node_workload / node.memory * 100 + host_metric = 'host_ram_usage_percent' instance_data = { 'compute_node': node, self._meter: node_util, @@ -262,6 +279,9 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): overload_hosts.append(instance_data) else: nonoverload_hosts.append(instance_data) + LOG.debug(f"Host usage for {node_id}: {host_metric} is " + f"{node_util}. 
Higher than threshold {self.threshold}: " + f"{node_util >= self.threshold}") avg_workload = 0 if cluster_size != 0: diff --git a/watcher/tests/decision_engine/model/gnocchi_metrics.py b/watcher/tests/decision_engine/model/gnocchi_metrics.py index f5a2df73f..1ad7141f4 100644 --- a/watcher/tests/decision_engine/model/gnocchi_metrics.py +++ b/watcher/tests/decision_engine/model/gnocchi_metrics.py @@ -304,8 +304,8 @@ class FakeGnocchiMetrics(object): mock = {} # node 0 - mock['INSTANCE_1'] = 30 - mock['73b09e16-35b7-4922-804e-e8f5d9b740fc'] = 12 + mock['INSTANCE_1'] = 40 + mock['73b09e16-35b7-4922-804e-e8f5d9b740fc'] = 9 # node 1 mock['INSTANCE_3'] = 12 mock['INSTANCE_4'] = 12 diff --git a/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py b/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py index ac49b4c9a..3f197b54b 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_balance.py @@ -22,6 +22,7 @@ from unittest import mock from watcher.applier.loading import default from watcher.common import utils from watcher.decision_engine.strategy import strategies +from watcher.decision_engine.strategy.strategies import workload_balance from watcher.tests.decision_engine.model import gnocchi_metrics from watcher.tests.decision_engine.strategy.strategies.test_base \ import TestBaseStrategy @@ -77,7 +78,7 @@ class TestWorkloadBalance(TestBaseStrategy): n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() self.assertEqual(n1[0]['compute_node'].uuid, 'Node_0') self.assertEqual(n2[0]['compute_node'].uuid, 'Node_1') - self.assertEqual(avg, 33.0) + self.assertEqual(avg, 36.5) def test_choose_instance_to_migrate(self): model = self.fake_c_cluster.generate_scenario_6_with_2_nodes() @@ -99,7 +100,8 @@ class TestWorkloadBalance(TestBaseStrategy): n1, avg, w_map) self.assertIsNone(instance_to_mig) - def 
test_filter_destination_hosts(self): + @mock.patch.object(workload_balance.LOG, 'debug', autospec=True) + def test_filter_destination_hosts_cpu(self, mock_debug): model = self.fake_c_cluster.generate_scenario_6_with_2_nodes() self.m_c_model.return_value = model self.strategy.datasource = mock.MagicMock( @@ -111,6 +113,42 @@ class TestWorkloadBalance(TestBaseStrategy): n2, instance_to_mig[1], avg, w_map) self.assertEqual(len(dest_hosts), 1) self.assertEqual(dest_hosts[0]['compute_node'].uuid, 'Node_1') + expected_calls = [ + mock.call('Host usage for Node_0: host_cpu_usage_percent is 32.5. ' + 'Higher than threshold 25.0: True'), + mock.call('Host usage for Node_1: host_cpu_usage_percent is 7.5. ' + 'Higher than threshold 25.0: False'), + mock.call('Host hostname_1 evaluated as destination for ' + '73b09e16-35b7-4922-804e-e8f5d9b740fc. Host usage for ' + 'cpu would be 20.0.The threshold is: 25.0. selected: ' + 'True')] + mock_debug.assert_has_calls(expected_calls, any_order=True) + + @mock.patch.object(workload_balance.LOG, 'debug', autospec=True) + def test_filter_destination_hosts_ram(self, mock_debug): + model = self.fake_c_cluster.generate_scenario_6_with_2_nodes() + self.m_c_model.return_value = model + self.strategy._meter = 'instance_ram_usage' + self.strategy.threshold = 30.0 + self.strategy.datasource = mock.MagicMock( + statistic_aggregation=self.fake_metrics.mock_get_statistics_wb) + n1, n2, avg, w_map = self.strategy.group_hosts_by_cpu_or_ram_util() + instance_to_mig = self.strategy.choose_instance_to_migrate( + n1, avg, w_map) + dest_hosts = self.strategy.filter_destination_hosts( + n2, instance_to_mig[1], avg, w_map) + self.assertEqual(len(dest_hosts), 1) + self.assertEqual(dest_hosts[0]['compute_node'].uuid, 'Node_1') + expected_calls = [ + mock.call('Host usage for Node_0: host_ram_usage_percent is ' + '37.121212121212125. Higher than threshold 30.0: True'), + mock.call('Host usage for Node_1: host_ram_usage_percent is ' + '18.181818181818183. 
Higher than threshold 30.0: False'), + mock.call('Host hostname_1 evaluated as destination for ' + '73b09e16-35b7-4922-804e-e8f5d9b740fc. Host usage for ' + 'ram would be 25.0.The threshold is: 30.0. selected: ' + 'True')] + mock_debug.assert_has_calls(expected_calls, any_order=True) def test_execute_no_workload(self): model = self.fake_c_cluster.\