From fb85b27ae3c60f414bd6fba59a1de88a61050b24 Mon Sep 17 00:00:00 2001 From: Alfredo Moralejo Date: Tue, 10 Jun 2025 11:25:31 +0200 Subject: [PATCH] Use KiB as unit for host_ram_usage when using prometheus datasource The prometheus datasource was reporting host_ram_usage in MiB as described in the docstring for the base datasource interface definition [1]. However, the gnocchi datasource is reporting it in KiB following ceilometer metric `hardware.memory.used` [2] and the strategies using that metric expect it to be in KiB so the best approach is to change the unit in the prometheus datasource and update the docstring to avoid missunderstandings in future. So, this patch is fixing the prometheus datasource to return host_ram_usage in KiB instead of MiB. Additionally, it is adding more unit tests for the check_threshold method so that it covers the memory based strategy execution, validates the calculated standard deviation and adds the cases where it is below the threshold. [1] https://github.com/openstack/watcher/blob/15981117ee28627f235264e505e1e0d5956cf4e4/watcher/decision_engine/datasources/base.py#L177-L183 [2] https://docs.openstack.org/ceilometer/train/admin/telemetry-measurements.html#snmp-based-meters Closes-Bug: #2113776 Change-Id: Idc060d1e709c0265c64ada16062c3a206c6b04fa (cherry picked from commit 6ea362da0ba49dc58ded46647fc9a4d1befe7de0) --- .../notes/bug-2113776-4bd314fb46623fbc.yaml | 14 +++++ watcher/decision_engine/datasources/base.py | 2 +- .../decision_engine/datasources/prometheus.py | 5 +- .../datasources/test_prometheus_helper.py | 6 +- .../strategies/test_workload_stabilization.py | 62 ++++++++++++++++++- 5 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml diff --git a/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml new file mode 100644 index 000000000..251b03502 --- /dev/null +++ b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + When running an audit with the `workload_stabilization` strategy with + `instance_ram_usage` metric in a deployment with prometheus datasource, + the host metric for the ram usage was wrongly reported with the incorrect + unit which lead to incorrect standard deviation and action plans due to the + application of the wrong scale factor in the algorithm. + + The host ram usage metric is now properly reported in KB when using a + prometheus datasource and the strategy `workload_stabilization` calculates + the standard deviation properly. + + For more details: https://launchpad.net/bugs/2113776 diff --git a/watcher/decision_engine/datasources/base.py b/watcher/decision_engine/datasources/base.py index 63f03bcff..c196bece1 100644 --- a/watcher/decision_engine/datasources/base.py +++ b/watcher/decision_engine/datasources/base.py @@ -178,7 +178,7 @@ class DataSourceBase(object): granularity=None): """Get the ram usage for a host such as a compute_node - :return: ram usage as float in megabytes + :return: ram usage as float in kibibytes """ pass diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py index 7b7572f3b..0f3e1a914 100644 --- a/watcher/decision_engine/datasources/prometheus.py +++ b/watcher/decision_engine/datasources/prometheus.py @@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase): (node_memory_MemTotal_bytes{instance='the_host'} - avg_over_time( node_memory_MemAvailable_bytes{instance='the_host'}[300s])) - / 1024 / 1024 + / 1024 So we take total and subtract available memory to determine how much is in use. We use the prometheus xxx_over_time functions @@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase): 'meter': meter, 'period': period} ) elif meter == 'node_memory_MemAvailable_bytes': + # Prometheus metric is in B and we need to return KB query_args = ( "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} " "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}" - "[%(period)ss])) / 1024 / 1024" + "[%(period)ss])) / 1024" % {'label': self.prometheus_fqdn_label, 'label_value': instance_label, 'agg': aggregate, 'meter': meter, 'period': period} diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py index 5aaccb0ec..3e502729f 100644 --- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py +++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py @@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time" "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555') self.assertEqual(result, expected_query) @@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time" "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222') self.assertEqual(result, expected_query) @@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time" "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222') self.assertEqual(result, expected_query) diff --git a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py index 8a442755b..90721c19a 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py @@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy): len(self.strategy.simulate_migrations(self.hosts_load_assert))) def test_check_threshold(self): + + # sd for 0.05, 0.05, 0.07, 0.07, 0.8 + test_cpu_sd = 0.296 + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() - self.strategy.thresholds = {'instance_cpu_usage': 0.001, + self.strategy.thresholds = {'instance_cpu_usage': 0.25, 'instance_ram_usage': 0.2} + self.strategy.simulate_migrations = mock.Mock(return_value=True) self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_cpu_sd) + + def test_check_threshold_cpu(self): + + test_cpu_sd = 0.296 + + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_cpu_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25} + + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_cpu_sd) + + def test_check_threshold_ram(self): + + # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts + test_ram_sd = 0.071 + + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_ram_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25, + 'instance_ram_usage': 0.05} + + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_ram_sd) + + def test_check_threshold_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.thresholds = {'instance_cpu_usage': 0.3, + 'instance_ram_usage': 0.2} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) + + def test_check_threshold_cpu_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_cpu_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.4} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) + + def test_check_threshold_ram_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_ram_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25, + 'instance_ram_usage': 0.1} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) def test_execute_one_migration(self): self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()