Merge "Use KiB as unit for host_ram_usage when using prometheus datasource" into stable/2025.1

This commit is contained in:
Zuul
2025-06-27 16:59:14 +00:00
committed by Gerrit Code Review
5 changed files with 82 additions and 7 deletions

View File

@@ -0,0 +1,14 @@
---
fixes:
- |
When running an audit with the `workload_stabilization` strategy with
`instance_ram_usage` metric in a deployment with prometheus datasource,
the host RAM usage metric was reported in the wrong unit, which led to an
incorrect standard deviation and incorrect action plans because the
algorithm applied the wrong scale factor.
The host RAM usage metric is now reported in KiB when using a
prometheus datasource, so the `workload_stabilization` strategy calculates
the standard deviation correctly.
For more details: https://launchpad.net/bugs/2113776

View File

@@ -178,7 +178,7 @@ class DataSourceBase(object):
granularity=None):
"""Get the ram usage for a host such as a compute_node
:return: ram usage as float in megabytes
:return: ram usage as float in kibibytes
"""
pass

View File

@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
(node_memory_MemTotal_bytes{instance='the_host'} -
avg_over_time(
node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
/ 1024 / 1024
/ 1024
So we take total and subtract available memory to determine
how much is in use. We use the prometheus xxx_over_time functions
@@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase):
'meter': meter, 'period': period}
)
elif meter == 'node_memory_MemAvailable_bytes':
# Prometheus metric is in bytes and we need to return KiB
query_args = (
"(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
"- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
"[%(period)ss])) / 1024 / 1024"
"[%(period)ss])) / 1024"
% {'label': self.prometheus_fqdn_label,
'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}

View File

@@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
"(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
self.assertEqual(result, expected_query)
@@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)
@@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)

View File

@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
len(self.strategy.simulate_migrations(self.hosts_load_assert)))
def test_check_threshold(self):
# sd for 0.05, 0.05, 0.07, 0.07, 0.8
test_cpu_sd = 0.296
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.thresholds = {'instance_cpu_usage': 0.001,
self.strategy.thresholds = {'instance_cpu_usage': 0.25,
'instance_ram_usage': 0.2}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertTrue(self.strategy.check_threshold())
self.assertEqual(
round(self.strategy.sd_before_audit, 3),
test_cpu_sd)
def test_check_threshold_cpu(self):
    """check_threshold succeeds when only CPU usage is evaluated.

    The strategy is restricted to ``instance_cpu_usage`` with a threshold
    of 0.25, and the standard deviation recorded before the audit must
    round to 0.296. That value matches the population standard deviation
    of 0.05, 0.05, 0.07, 0.07, 0.8 (presumably the host CPU loads of
    scenario 1 -- confirm against the fake cluster model).
    """
    expected_sd = 0.296
    scenario = self.fake_c_cluster.generate_scenario_1()
    self.m_c_model.return_value = scenario
    self.strategy.metrics = ["instance_cpu_usage"]
    self.strategy.thresholds = {'instance_cpu_usage': 0.25}
    # simulate_migrations is replaced by a mock that always returns True.
    self.strategy.simulate_migrations = mock.Mock(return_value=True)
    outcome = self.strategy.check_threshold()
    self.assertTrue(outcome)
    measured_sd = round(self.strategy.sd_before_audit, 3)
    self.assertEqual(measured_sd, expected_sd)
def test_check_threshold_ram(self):
    """check_threshold succeeds when only RAM usage is evaluated.

    The strategy is restricted to ``instance_ram_usage`` with a RAM
    threshold of 0.05, and the standard deviation recorded before the
    audit must round to 0.071.
    """
    # sd for 4, 5, 7, 8, 29 MB used in 132 MB total memory hosts
    expected_sd = 0.071
    scenario = self.fake_c_cluster.generate_scenario_1()
    self.m_c_model.return_value = scenario
    self.strategy.metrics = ["instance_ram_usage"]
    limits = {
        'instance_cpu_usage': 0.25,
        'instance_ram_usage': 0.05,
    }
    self.strategy.thresholds = limits
    # simulate_migrations is replaced by a mock that always returns True.
    self.strategy.simulate_migrations = mock.Mock(return_value=True)
    outcome = self.strategy.check_threshold()
    self.assertTrue(outcome)
    measured_sd = round(self.strategy.sd_before_audit, 3)
    self.assertEqual(measured_sd, expected_sd)
def test_check_threshold_fail(self):
    """check_threshold returns a falsy result with thresholds 0.3 / 0.2.

    Uses scenario 1 with the strategy's metrics left as configured by the
    test fixture; the CPU threshold is 0.3 and the RAM threshold is 0.2.
    """
    scenario = self.fake_c_cluster.generate_scenario_1()
    self.m_c_model.return_value = scenario
    limits = {
        'instance_cpu_usage': 0.3,
        'instance_ram_usage': 0.2,
    }
    self.strategy.thresholds = limits
    # simulate_migrations is replaced by a mock that always returns True.
    self.strategy.simulate_migrations = mock.Mock(return_value=True)
    outcome = self.strategy.check_threshold()
    self.assertFalse(outcome)
def test_check_threshold_cpu_fail(self):
    """check_threshold returns a falsy result for CPU-only threshold 0.4.

    The strategy is restricted to ``instance_cpu_usage`` on scenario 1,
    with the CPU threshold set to 0.4.
    """
    scenario = self.fake_c_cluster.generate_scenario_1()
    self.m_c_model.return_value = scenario
    self.strategy.metrics = ["instance_cpu_usage"]
    self.strategy.thresholds = {'instance_cpu_usage': 0.4}
    # simulate_migrations is replaced by a mock that always returns True.
    self.strategy.simulate_migrations = mock.Mock(return_value=True)
    outcome = self.strategy.check_threshold()
    self.assertFalse(outcome)
def test_check_threshold_ram_fail(self):
    """check_threshold returns a falsy result for RAM-only threshold 0.1.

    The strategy is restricted to ``instance_ram_usage`` on scenario 1,
    with the RAM threshold set to 0.1 (the CPU threshold of 0.25 is also
    supplied in the thresholds mapping).
    """
    scenario = self.fake_c_cluster.generate_scenario_1()
    self.m_c_model.return_value = scenario
    self.strategy.metrics = ["instance_ram_usage"]
    limits = {
        'instance_cpu_usage': 0.25,
        'instance_ram_usage': 0.1,
    }
    self.strategy.thresholds = limits
    # simulate_migrations is replaced by a mock that always returns True.
    self.strategy.simulate_migrations = mock.Mock(return_value=True)
    outcome = self.strategy.check_threshold()
    self.assertFalse(outcome)
def test_execute_one_migration(self):
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()