From 9dea55bd64faf66e4de4c3220d8581683db0bf7c Mon Sep 17 00:00:00 2001 From: "Chandan Kumar (raukadah)" Date: Wed, 7 May 2025 08:55:11 +0530 Subject: [PATCH] Drop code from Host maintenance strategy migrating instance to disabled hosts Currently host maintenance strategy also migrates instances from maintenance node to watcher_disabled compute nodes. watcher_disabled compute nodes might be disabled for some other purpose by a different strategy. If host maintenance uses those compute nodes for migration, it might affect customer workloads. Host maintenance strategy should never touch disabled hosts unless the user specifies a disabled host as the backup node. This change drops the logic for using disabled compute nodes for maintenance. Host maintenance is already using the nova scheduler for migrating the instances and will keep using the same. If there is no available node, the strategy will fail. Closes-Bug: #2109945 Change-Id: If9795fd06f684eb67d553405cebd8a30887c3997 Signed-off-by: Chandan Kumar (raukadah) --- ...gy-on-disabled-hosts-24084a22d4c8f914.yaml | 10 ++++++++ .../strategy/strategies/host_maintenance.py | 24 +------------------ .../strategies/test_host_maintenance.py | 5 +++- 3 files changed, 15 insertions(+), 24 deletions(-) create mode 100644 releasenotes/notes/donot-run-host-migration-strategy-on-disabled-hosts-24084a22d4c8f914.yaml diff --git a/releasenotes/notes/donot-run-host-migration-strategy-on-disabled-hosts-24084a22d4c8f914.yaml b/releasenotes/notes/donot-run-host-migration-strategy-on-disabled-hosts-24084a22d4c8f914.yaml new file mode 100644 index 000000000..f2c91a385 --- /dev/null +++ b/releasenotes/notes/donot-run-host-migration-strategy-on-disabled-hosts-24084a22d4c8f914.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Host maintenance strategy should migrate servers based on backup node if specified + or rely on nova scheduler. It was enabling disabled hosts with watcher_disabled + reason and migrating servers to those nodes. It can impact customer workload. 
Compute + nodes were disabled for a reason. + + Host maintenance strategy is fixed now to support migrating servers only on backup + node or rely on nova scheduler if no backup node is provided. diff --git a/watcher/decision_engine/strategy/strategies/host_maintenance.py b/watcher/decision_engine/strategy/strategies/host_maintenance.py index ddf4335d8..da694612d 100644 --- a/watcher/decision_engine/strategy/strategies/host_maintenance.py +++ b/watcher/decision_engine/strategy/strategies/host_maintenance.py @@ -53,7 +53,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy): INSTANCE_MIGRATION = "migrate" CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state" - REASON_FOR_DISABLE = 'watcher_disabled' def __init__(self, config, osc=None): super(HostMaintenance, self).__init__(config, osc) @@ -95,10 +94,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy): cn.status == element.ServiceState.DISABLED.value and cn.disabled_reason == reason} - def get_disabled_compute_nodes(self): - return self.get_disabled_compute_nodes_with_reason( - self.REASON_FOR_DISABLE) - def get_instance_state_str(self, instance): """Get instance state in string format""" if isinstance(instance.state, str): @@ -215,8 +210,7 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy): """safe maintain one compute node Migrate all instances of the maintenance_node intensively to the - backup host. If the user didn't give the backup host, it will - select one unused node to backup the maintaining node. + backup host. It calculate the resource both of the backup node and maintaining node to evaluate the migrations from maintaining node to backup node. @@ -233,22 +227,6 @@ class HostMaintenance(base.HostMaintenanceBaseStrategy): self.host_migration(maintenance_node, backup_node) return True - # If the user didn't give the backup host, select one unused - # node with required capacity, then migrates all instances - # from maintaining node to it. 
- nodes = sorted( - self.get_disabled_compute_nodes().values(), - key=lambda x: self.get_node_capacity(x)['cpu']) - if maintenance_node in nodes: - nodes.remove(maintenance_node) - - for node in nodes: - if self.host_fits(maintenance_node, node): - self.enable_compute_node_if_disabled(node) - self.add_action_maintain_compute_node(maintenance_node) - self.host_migration(maintenance_node, node) - return True - - return False def try_maintain(self, maintenance_node): diff --git a/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py b/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py index f8080760c..d092c1501 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_host_maintenance.py @@ -167,12 +167,15 @@ class TestHostMaintenance(TestBaseStrategy): node_1 = model.get_node_by_uuid('Node_1') self.assertFalse(self.strategy.safe_maintain(node_0)) self.assertFalse(self.strategy.safe_maintain(node_1)) + # It will return True if a backup node is passed + self.assertTrue(self.strategy.safe_maintain(node_0, node_1)) model = self.fake_c_cluster.\ generate_scenario_1_with_all_nodes_disable() self.m_c_model.return_value = model node_0 = model.get_node_by_uuid('Node_0') - self.assertTrue(self.strategy.safe_maintain(node_0)) + # It will return False if there is no backup node + self.assertFalse(self.strategy.safe_maintain(node_0)) def test_try_maintain(self): model = self.fake_c_cluster.generate_scenario_1()