Use KiB as unit for host_ram_usage when using prometheus datasource

The prometheus datasource was reporting host_ram_usage in MiB as
described in the docstring for the base datasource interface
definition [1].

However, the gnocchi datasource is reporting it in KiB following
ceilometer metric `hardware.memory.used` [2] and the strategies
using that metric expect it to be in KiB so the best approach is
to change the unit in the prometheus datasource and update the
docstring to avoid missunderstandings in future. So, this patch
is fixing the prometheus datasource to return host_ram_usage
in KiB instead of MiB.

Additionally, it is adding more unit tests for the check_threshold
method so that it covers the memory based strategy execution, validates
the calculated standard deviation and adds the cases where it is below
the threshold.

[1] 15981117ee/watcher/decision_engine/datasources/base.py (L177-L183)
[2] https://docs.openstack.org/ceilometer/train/admin/telemetry-measurements.html#snmp-based-meters

Closes-Bug: #2113776
Change-Id: Idc060d1e709c0265c64ada16062c3a206c6b04fa
This commit is contained in:
Alfredo Moralejo
2025-06-10 11:25:31 +02:00
parent 15981117ee
commit 6ea362da0b
5 changed files with 82 additions and 7 deletions

View File

@@ -0,0 +1,14 @@
---
fixes:
- |
When running an audit with the `workload_stabilization` strategy with
`instance_ram_usage` metric in a deployment with prometheus datasource,
the host metric for the ram usage was wrongly reported with the incorrect
unit which lead to incorrect standard deviation and action plans due to the
application of the wrong scale factor in the algorithm.
The host ram usage metric is now properly reported in KB when using a
prometheus datasource and the strategy `workload_stabilization` calculates
the standard deviation properly.
For more details: https://launchpad.net/bugs/2113776

View File

@@ -178,7 +178,7 @@ class DataSourceBase(object):
granularity=None):
"""Get the ram usage for a host such as a compute_node
:return: ram usage as float in megabytes
:return: ram usage as float in kibibytes
"""
pass

View File

@@ -276,7 +276,7 @@ class PrometheusHelper(base.DataSourceBase):
(node_memory_MemTotal_bytes{instance='the_host'} -
avg_over_time(
node_memory_MemAvailable_bytes{instance='the_host'}[300s]))
/ 1024 / 1024
/ 1024
So we take total and subtract available memory to determine
how much is in use. We use the prometheus xxx_over_time functions
@@ -315,10 +315,11 @@ class PrometheusHelper(base.DataSourceBase):
'meter': meter, 'period': period}
)
elif meter == 'node_memory_MemAvailable_bytes':
# Prometheus metric is in B and we need to return KB
query_args = (
"(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
"- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
"[%(period)ss])) / 1024 / 1024"
"[%(period)ss])) / 1024"
% {'label': self.prometheus_fqdn_label,
'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}

View File

@@ -593,7 +593,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
"(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
self.assertEqual(result, expected_query)
@@ -602,7 +602,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)
@@ -621,7 +621,7 @@ class TestPrometheusHelper(base.BaseTestCase):
expected_query = (
"(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
"/ 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)

View File

@@ -212,11 +212,71 @@ class TestWorkloadStabilization(TestBaseStrategy):
len(self.strategy.simulate_migrations(self.hosts_load_assert)))
def test_check_threshold(self):
# sd for 0.05, 0.05, 0.07, 0.07, 0.8
test_cpu_sd = 0.296
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.thresholds = {'instance_cpu_usage': 0.001,
self.strategy.thresholds = {'instance_cpu_usage': 0.25,
'instance_ram_usage': 0.2}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertTrue(self.strategy.check_threshold())
self.assertEqual(
round(self.strategy.sd_before_audit, 3),
test_cpu_sd)
def test_check_threshold_cpu(self):
test_cpu_sd = 0.296
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.metrics = ["instance_cpu_usage"]
self.strategy.thresholds = {'instance_cpu_usage': 0.25}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertTrue(self.strategy.check_threshold())
self.assertEqual(
round(self.strategy.sd_before_audit, 3),
test_cpu_sd)
def test_check_threshold_ram(self):
# sd for 4,5,7,8, 29 MB used in 132MB total memory hosts
test_ram_sd = 0.071
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.metrics = ["instance_ram_usage"]
self.strategy.thresholds = {'instance_cpu_usage': 0.25,
'instance_ram_usage': 0.05}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertTrue(self.strategy.check_threshold())
self.assertEqual(
round(self.strategy.sd_before_audit, 3),
test_ram_sd)
def test_check_threshold_fail(self):
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.thresholds = {'instance_cpu_usage': 0.3,
'instance_ram_usage': 0.2}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertFalse(self.strategy.check_threshold())
def test_check_threshold_cpu_fail(self):
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.metrics = ["instance_cpu_usage"]
self.strategy.thresholds = {'instance_cpu_usage': 0.4}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertFalse(self.strategy.check_threshold())
def test_check_threshold_ram_fail(self):
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()
self.strategy.metrics = ["instance_ram_usage"]
self.strategy.thresholds = {'instance_cpu_usage': 0.25,
'instance_ram_usage': 0.1}
self.strategy.simulate_migrations = mock.Mock(return_value=True)
self.assertFalse(self.strategy.check_threshold())
def test_execute_one_migration(self):
self.m_c_model.return_value = self.fake_c_cluster.generate_scenario_1()