diff --git a/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml new file mode 100644 index 000000000..251b03502 --- /dev/null +++ b/releasenotes/notes/bug-2113776-4bd314fb46623fbc.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + When running an audit with the `workload_stabilization` strategy with + `instance_ram_usage` metric in a deployment with prometheus datasource, + the host metric for the ram usage was wrongly reported with the incorrect + unit, which led to incorrect standard deviation and action plans due to the + application of the wrong scale factor in the algorithm. + + The host ram usage metric is now properly reported in KiB when using a + prometheus datasource and the strategy `workload_stabilization` calculates + the standard deviation properly. + + For more details: https://launchpad.net/bugs/2113776 diff --git a/watcher/decision_engine/datasources/base.py b/watcher/decision_engine/datasources/base.py index 63f03bcff..c196bece1 100644 --- a/watcher/decision_engine/datasources/base.py +++ b/watcher/decision_engine/datasources/base.py @@ -178,7 +178,7 @@ class DataSourceBase(object): granularity=None): """Get the ram usage for a host such as a compute_node - :return: ram usage as float in megabytes + :return: ram usage as float in kibibytes """ pass diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py index 7b7572f3b..0f3e1a914 100644 --- a/watcher/decision_engine/datasources/prometheus.py +++ b/watcher/decision_engine/datasources/prometheus.py @@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase): (node_memory_MemTotal_bytes{instance='the_host'} - avg_over_time( node_memory_MemAvailable_bytes{instance='the_host'}[300s])) - / 1024 / 1024 + / 1024 So we take total and subtract available memory to determine how much is in use. 
We use the prometheus xxx_over_time functions @@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase): 'meter': meter, 'period': period} ) elif meter == 'node_memory_MemAvailable_bytes': + # Prometheus metric is in B and we need to return KB query_args = ( "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} " "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}" - "[%(period)ss])) / 1024 / 1024" + "[%(period)ss])) / 1024" % {'label': self.prometheus_fqdn_label, 'label_value': instance_label, 'agg': aggregate, 'meter': meter, 'period': period} diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py index 5aaccb0ec..3e502729f 100644 --- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py +++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py @@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time" "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555') self.assertEqual(result, expected_query) @@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time" "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222') self.assertEqual(result, expected_query) @@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase): expected_query = ( "(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time" "(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) " - "/ 1024 / 1024") + "/ 1024") result = self.helper._build_prometheus_query( 'min', 
'node_memory_MemAvailable_bytes', 'd_host', '222') self.assertEqual(result, expected_query) diff --git a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py index 8a442755b..90721c19a 100644 --- a/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py +++ b/watcher/tests/decision_engine/strategy/strategies/test_workload_stabilization.py @@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy): len(self.strategy.simulate_migrations(self.hosts_load_assert))) def test_check_threshold(self): + + # sd for 0.05, 0.05, 0.07, 0.07, 0.8 + test_cpu_sd = 0.296 + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() - self.strategy.thresholds = {'instance_cpu_usage': 0.001, + self.strategy.thresholds = {'instance_cpu_usage': 0.25, 'instance_ram_usage': 0.2} + self.strategy.simulate_migrations = mock.Mock(return_value=True) self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_cpu_sd) + + def test_check_threshold_cpu(self): + + test_cpu_sd = 0.296 + + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_cpu_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25} + + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_cpu_sd) + + def test_check_threshold_ram(self): + + # sd for 4,5,7,8, 29 MB used in 132MB total memory hosts + test_ram_sd = 0.071 + + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_ram_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25, + 'instance_ram_usage': 0.05} + + self.strategy.simulate_migrations = mock.Mock(return_value=True) + 
self.assertTrue(self.strategy.check_threshold()) + self.assertEqual( + round(self.strategy.sd_before_audit, 3), + test_ram_sd) + + def test_check_threshold_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.thresholds = {'instance_cpu_usage': 0.3, + 'instance_ram_usage': 0.2} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) + + def test_check_threshold_cpu_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_cpu_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.4} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) + + def test_check_threshold_ram_fail(self): + self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1() + self.strategy.metrics = ["instance_ram_usage"] + self.strategy.thresholds = {'instance_cpu_usage': 0.25, + 'instance_ram_usage': 0.1} + self.strategy.simulate_migrations = mock.Mock(return_value=True) + self.assertFalse(self.strategy.check_threshold()) def test_execute_one_migration(self): self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()