Merge "Query by fqdn_label instead of instance for host metrics"

This commit is contained in:
Zuul
2025-03-20 12:50:28 +00:00
committed by Gerrit Code Review
4 changed files with 173 additions and 90 deletions

View File

@@ -29,19 +29,19 @@ This default can be overridden when a deployer uses a different label to
identify the exporter host (for example ``hostname`` or ``host``, or any other identify the exporter host (for example ``hostname`` or ``host``, or any other
label, as long as it identifies the host). label, as long as it identifies the host).
Internally this label is used in creating a ``fqdn_instance_map``, mapping Internally this label is used in creating ``fqdn_instance_labels``, containing
the fqdn with the Prometheus instance label associated with each exporter. the list of values assigned to the label in the Prometheus targets.
The keys of the resulting fqdn_instance_map are expected to match the The elements of the resulting fqdn_instance_labels are expected to match the
``ComputeNode.hostname`` used in the Watcher decision engine cluster model. ``ComputeNode.hostname`` used in the Watcher decision engine cluster model.
An example ``fqdn_instance_map`` is the following: An example ``fqdn_instance_labels`` is the following:
.. code-block:: .. code-block::
{ [
'ena.controlplane.domain': '10.1.2.1:9100', 'ena.controlplane.domain',
'dio.controlplane.domain': '10.1.2.2:9100', 'dio.controlplane.domain',
'tria.controlplane.domain': '10.1.2.3:9100' 'tria.controlplane.domain',
} ]
For instance metrics, it is required that Prometheus contains a label For instance metrics, it is required that Prometheus contains a label
with the uuid of the OpenStack instance in each relevant metric. By default, with the uuid of the OpenStack instance in each relevant metric. By default,

View File

@@ -0,0 +1,9 @@
---
fixes:
- |
When using the prometheus datasource and more than one target has the same value
for the `fqdn_label`, the driver used the wrong instance label to query for host
metrics. The `instance` label is no longer used in the queries; the `fqdn_label`,
which identifies all the metrics for a specific compute node, is used instead.
.. _Bug 2103451: https://bugs.launchpad.net/watcher/+bug/2103451

View File

@@ -51,10 +51,10 @@ class PrometheusHelper(base.DataSourceBase):
The prometheus helper uses the PrometheusAPIClient provided by The prometheus helper uses the PrometheusAPIClient provided by
python-observabilityclient. python-observabilityclient.
The prometheus_fqdn_instance_map maps the fqdn of each node to the The prometheus_fqdn_labels contains a list of the values contained in
Prometheus instance label added to all metrics on that node. When the fqdn_label in the Prometheus instance. When making queries to
making queries to Prometheus we use the instance label to specify Prometheus we use the fqdn_label to specify the node for which
the node for which metrics are to be retrieved. metrics are to be retrieved.
host, port and fqdn_label come from watcher_client host, port and fqdn_label come from watcher_client
config. The prometheus_fqdn_label allows override of the required label config. The prometheus_fqdn_label allows override of the required label
in Prometheus scrape configs that specifies each target's fqdn. in Prometheus scrape configs that specifies each target's fqdn.
@@ -63,8 +63,8 @@ class PrometheusHelper(base.DataSourceBase):
self.prometheus_fqdn_label = ( self.prometheus_fqdn_label = (
CONF.prometheus_client.fqdn_label CONF.prometheus_client.fqdn_label
) )
self.prometheus_fqdn_instance_map = ( self.prometheus_fqdn_labels = (
self._build_prometheus_fqdn_instance_map() self._build_prometheus_fqdn_labels()
) )
self.prometheus_host_instance_map = ( self.prometheus_host_instance_map = (
self._build_prometheus_host_instance_map() self._build_prometheus_host_instance_map()
@@ -136,73 +136,71 @@ class PrometheusHelper(base.DataSourceBase):
return the_client return the_client
def _build_prometheus_fqdn_instance_map(self): def _build_prometheus_fqdn_labels(self):
"""Build the fqdn<-->instance_label mapping needed for queries """Build the list of fqdn_label values to be used in host queries
Watcher knows nodes by their hostname. In Prometheus however the Watcher knows nodes by their hostname. In Prometheus however the
scrape targets (also known as 'instances') are specified by I.P. scrape targets (also known as 'instances') are specified by I.P.
(or hostname) and port number. This function creates a mapping between (or hostname) and port number and fqdn is stored in a custom 'fqdn'
the fully qualified domain name of each node and the corresponding label added to Prometheus scrape_configs. Operators can use a
instance label used in the scrape config. This relies on a custom different custom label instead by setting the prometheus_fqdn_label
'fqdn' label added to Prometheus scrape_configs. Operators can use
a different custom label instead by setting the prometheus_fqdn_label
config option under the prometheus_client section of watcher config. config option under the prometheus_client section of watcher config.
The built prometheus_fqdn_instance_map is used to match watcher The built prometheus_fqdn_labels is created with the full list
node.hostname if watcher stores fqdn and otherwise the of values of the prometheus_fqdn_label label in Prometheus. This will
host_instance_map is used instead. be used to create a map of hostname<-->fqdn and to identify if a target
:return a dict mapping fqdn to instance label. For example: exists in Prometheus for the compute nodes before sending the query.
{'marios-env-again.controlplane.domain': '10.1.2.3:9100'} :return a set of values of the fqdn label. For example:
{'foo.example.com', 'bar.example.com'}
{'foo', 'bar'}
""" """
prometheus_targets = self.prometheus._get( prometheus_targets = self.prometheus._get(
"targets?state=active")['data']['activeTargets'] "targets?state=active")['data']['activeTargets']
# >>> prometheus_targets[0]['labels'] # >>> prometheus_targets[0]['labels']
# {'fqdn': 'marios-env-again.controlplane.domain', # {'fqdn': 'marios-env-again.controlplane.domain',
# 'instance': 'localhost:9100', 'job': 'node'} # 'instance': 'localhost:9100', 'job': 'node'}
fqdn_instance_map = { fqdn_instance_labels = set()
fqdn: instance for (fqdn, instance) in ( for target in prometheus_targets:
(target['labels'].get(self.prometheus_fqdn_label), if target.get('labels', {}).get(self.prometheus_fqdn_label):
target['labels'].get('instance')) fqdn_instance_labels.add(
for target in prometheus_targets target['labels'].get(self.prometheus_fqdn_label))
if target.get('labels', {}).get(self.prometheus_fqdn_label)
) if not fqdn_instance_labels:
}
if not fqdn_instance_map:
LOG.error( LOG.error(
"Could not create fqdn instance map from Prometheus " "Could not create fqdn labels list from Prometheus "
"targets config. Prometheus returned the following: %s", "targets config. Prometheus returned the following: %s",
prometheus_targets prometheus_targets
) )
return {} return set()
return fqdn_instance_map return fqdn_instance_labels
def _build_prometheus_host_instance_map(self): def _build_prometheus_host_instance_map(self):
"""Build the hostname<-->instance_label mapping needed for queries """Build the hostname<-->instance_label mapping needed for queries
The prometheus_fqdn_instance_map has the fully qualified domain name The prometheus_fqdn_labels has the fully qualified domain name
for hosts. This will create a duplicate map containing only the host for hosts. This will create a duplicate map containing only the host
name part. Depending on the watcher node.hostname either the name part. Depending on the watcher node.hostname either the
fqdn_instance_map or the host_instance_map will be used to resolve fqdn_instance_labels or the host_instance_map will be used to resolve
the correct prometheus instance label for queries. In the event the the correct prometheus fqdn_label for queries. In the event the
fqdn_instance_map keys are not valid fqdn (for example it contains fqdn_instance_labels elements are not valid fqdn (for example it has
hostnames, not fqdn) the host_instance_map cannot be created and hostnames, not fqdn) the host_instance_map cannot be created and
an empty dictionary is returned with a warning logged. an empty dictionary is returned with a warning logged.
:return a dict mapping hostname to instance label. For example: :return a dict mapping hostname to instance label. For example:
{'marios-env-again': 'localhost:9100'} {'foo': 'foo.example.com', 'bar': 'bar.example.com'}
""" """
if not self.prometheus_fqdn_instance_map: if not self.prometheus_fqdn_labels:
LOG.error("Cannot build host_instance_map without " LOG.error("Cannot build host_instance_map without "
"fqdn_instance_map") "fqdn_instance_labels")
return {} return {}
host_instance_map = { host_instance_map = {
host: instance for (host, instance) in ( host: fqdn for (host, fqdn) in (
(fqdn.split('.')[0], inst) (fqdn.split('.')[0], fqdn)
for fqdn, inst in self.prometheus_fqdn_instance_map.items() for fqdn in self.prometheus_fqdn_labels
if '.' in fqdn if '.' in fqdn
) )
} }
if not host_instance_map: if not host_instance_map:
LOG.warning("Creating empty host instance map. Are the keys " LOG.warning("Creating empty host instance map. Are the keys "
"in prometheus_fqdn_instance_map valid fqdn?") "in prometheus_fqdn_labels valid fqdn?")
return {} return {}
return host_instance_map return host_instance_map
@@ -210,23 +208,25 @@ class PrometheusHelper(base.DataSourceBase):
"""Resolve the prometheus instance label to use in queries """Resolve the prometheus instance label to use in queries
Given the watcher node.hostname, resolve the prometheus instance Given the watcher node.hostname, resolve the prometheus instance
label for use in queries, first trying the fqdn_instance_map and label for use in queries, first trying the fqdn_instance_labels and
then the host_instance_map (watcher.node_name can be fqdn or hostname). then the host_instance_map (watcher.node_name can be fqdn or hostname).
If the name is not resolved after the first attempt, rebuild the fqdn If the name is not resolved after the first attempt, rebuild the fqdn
and host instance maps and try again. This allows for new hosts added and host instance maps and try again. This allows for new hosts added
after the initialisation of the fqdn_instance_map. after the initialisation of the fqdn_instance_labels.
:param node_name: the watcher node.hostname :param node_name: the watcher node.hostname
:return String for the prometheus instance label and None if not found :return String for the prometheus instance label and None if not found
""" """
def _query_maps(node): def _query_maps(node):
return self.prometheus_fqdn_instance_map.get( if node in self.prometheus_fqdn_labels:
node, self.prometheus_host_instance_map.get(node, None)) return node
else:
return self.prometheus_host_instance_map.get(node, None)
instance_label = _query_maps(node_name) instance_label = _query_maps(node_name)
# refresh the fqdn and host instance maps and retry # refresh the fqdn and host instance maps and retry
if not instance_label: if not instance_label:
self.prometheus_fqdn_instance_map = ( self.prometheus_fqdn_labels = (
self._build_prometheus_fqdn_instance_map() self._build_prometheus_fqdn_labels()
) )
self.prometheus_host_instance_map = ( self.prometheus_host_instance_map = (
self._build_prometheus_host_instance_map() self._build_prometheus_host_instance_map()
@@ -307,17 +307,21 @@ class PrometheusHelper(base.DataSourceBase):
if meter == 'node_cpu_seconds_total': if meter == 'node_cpu_seconds_total':
query_args = ( query_args = (
"100 - (%s by (instance)(rate(%s" "100 - (%(agg)s by (instance)(rate(%(meter)s"
"{mode='idle',instance='%s'}[%ss])) * 100)" % "{mode='idle',%(label)s='%(label_value)s'}[%(period)ss])) "
(aggregate, meter, instance_label, period) "* 100)"
% {'label': self.prometheus_fqdn_label,
'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}
) )
elif meter == 'node_memory_MemAvailable_bytes': elif meter == 'node_memory_MemAvailable_bytes':
query_args = ( query_args = (
"(node_memory_MemTotal_bytes{instance='%s'} " "(node_memory_MemTotal_bytes{%(label)s='%(label_value)s'} "
"- %s_over_time(%s{instance='%s'}[%ss])) " "- %(agg)s_over_time(%(meter)s{%(label)s='%(label_value)s'}"
"/ 1024 / 1024" % "[%(period)ss])) / 1024 / 1024"
(instance_label, aggregate, meter, % {'label': self.prometheus_fqdn_label,
instance_label, period) 'label_value': instance_label, 'agg': aggregate,
'meter': meter, 'period': period}
) )
elif meter == 'ceilometer_memory_usage': elif meter == 'ceilometer_memory_usage':
query_args = ( query_args = (

View File

@@ -147,7 +147,8 @@ class TestPrometheusHelper(base.BaseTestCase):
self.assertEqual(expected_cpu_usage, result) self.assertEqual(expected_cpu_usage, result)
mock_prometheus_query.assert_called_once_with( mock_prometheus_query.assert_called_once_with(
"100 - (avg by (instance)(rate(node_cpu_seconds_total" "100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='10.0.1.2:9100'}[300s])) * 100)") "{mode='idle',fqdn='marios-env.controlplane.domain'}[300s]))"
" * 100)")
@mock.patch.object(prometheus_client.PrometheusAPIClient, 'query') @mock.patch.object(prometheus_client.PrometheusAPIClient, 'query')
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@@ -437,15 +438,48 @@ class TestPrometheusHelper(base.BaseTestCase):
'instance': '10.1.2.3:9100', 'job': 'node', 'instance': '10.1.2.3:9100', 'job': 'node',
}}, }},
]}} ]}}
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100', expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain': '10.1.2.2:9100', 'bar.controlplane.domain',
'baz.controlplane.domain': '10.1.2.3:9100'} 'baz.controlplane.domain'}
expected_host_map = {'foo': '10.1.2.1:9100', expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': '10.1.2.2:9100', 'bar': 'bar.controlplane.domain',
'baz': '10.1.2.3:9100'} 'baz': 'baz.controlplane.domain'}
helper = prometheus_helper.PrometheusHelper() helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_instance_map, self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_map) expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_build_prometheus_fqdn_host_instance_map_dupl_fqdn(
self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [
{'labels': {
'fqdn': 'foo.controlplane.domain',
'instance': '10.1.2.1:9100', 'job': 'node',
}},
{'labels': {
'fqdn': 'foo.controlplane.domain',
'instance': '10.1.2.1:9229', 'job': 'podman',
}},
{'labels': {
'fqdn': 'bar.controlplane.domain',
'instance': '10.1.2.2:9100', 'job': 'node',
}},
{'labels': {
'fqdn': 'baz.controlplane.domain',
'instance': '10.1.2.3:9100', 'job': 'node',
}},
]}}
expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain',
'baz.controlplane.domain'}
expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': 'bar.controlplane.domain',
'baz': 'baz.controlplane.domain'}
helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map, self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map) expected_host_map)
@@ -460,7 +494,7 @@ class TestPrometheusHelper(base.BaseTestCase):
}}, }},
]}} ]}}
helper = prometheus_helper.PrometheusHelper() helper = prometheus_helper.PrometheusHelper()
self.assertEqual({}, helper.prometheus_fqdn_instance_map) self.assertEqual(set(), helper.prometheus_fqdn_labels)
self.assertEqual({}, helper.prometheus_host_instance_map) self.assertEqual({}, helper.prometheus_host_instance_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
@@ -476,12 +510,29 @@ class TestPrometheusHelper(base.BaseTestCase):
}}, }},
]}} ]}}
helper = prometheus_helper.PrometheusHelper() helper = prometheus_helper.PrometheusHelper()
expected_fqdn_map = {'ena': '10.1.2.1:9100', expected_fqdn_list = {'ena', 'dyo'}
'dyo': '10.1.2.2:9100'}
self.assertEqual( self.assertEqual(
helper.prometheus_fqdn_instance_map, expected_fqdn_map) helper.prometheus_fqdn_labels, expected_fqdn_list)
self.assertEqual({}, helper.prometheus_host_instance_map) self.assertEqual({}, helper.prometheus_host_instance_map)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_using_ips_not_fqdn(self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [
{'labels': {
'ip_label': '10.1.2.1',
'instance': '10.1.2.1:9100', 'job': 'node',
}},
{'labels': {
'ip_label': '10.1.2.2',
'instance': '10.1.2.2:9100', 'job': 'node',
}},
]}}
cfg.CONF.prometheus_client.fqdn_label = 'ip_label'
helper = prometheus_helper.PrometheusHelper()
expected_fqdn_list = {'10.1.2.1', '10.1.2.2'}
self.assertEqual(
helper.prometheus_fqdn_labels, expected_fqdn_list)
@mock.patch.object(prometheus_client.PrometheusAPIClient, '_get') @mock.patch.object(prometheus_client.PrometheusAPIClient, '_get')
def test_override_prometheus_fqdn_label(self, mock_prometheus_get): def test_override_prometheus_fqdn_label(self, mock_prometheus_get):
mock_prometheus_get.return_value = {'data': {'activeTargets': [ mock_prometheus_get.return_value = {'data': {'activeTargets': [
@@ -494,19 +545,19 @@ class TestPrometheusHelper(base.BaseTestCase):
'instance': '10.1.2.2:9100', 'job': 'node', 'instance': '10.1.2.2:9100', 'job': 'node',
}}, }},
]}} ]}}
expected_fqdn_map = {'foo.controlplane.domain': '10.1.2.1:9100', expected_fqdn_list = {'foo.controlplane.domain',
'bar.controlplane.domain': '10.1.2.2:9100'} 'bar.controlplane.domain'}
expected_host_map = {'foo': '10.1.2.1:9100', expected_host_map = {'foo': 'foo.controlplane.domain',
'bar': '10.1.2.2:9100'} 'bar': 'bar.controlplane.domain'}
cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label' cfg.CONF.prometheus_client.fqdn_label = 'custom_fqdn_label'
helper = prometheus_helper.PrometheusHelper() helper = prometheus_helper.PrometheusHelper()
self.assertEqual(helper.prometheus_fqdn_instance_map, self.assertEqual(helper.prometheus_fqdn_labels,
expected_fqdn_map) expected_fqdn_list)
self.assertEqual(helper.prometheus_host_instance_map, self.assertEqual(helper.prometheus_host_instance_map,
expected_host_map) expected_host_map)
def test_resolve_prometheus_instance_label(self): def test_resolve_prometheus_instance_label(self):
expected_instance_label = '10.0.1.2:9100' expected_instance_label = 'marios-env.controlplane.domain'
result = self.helper._resolve_prometheus_instance_label( result = self.helper._resolve_prometheus_instance_label(
'marios-env.controlplane.domain') 'marios-env.controlplane.domain')
self.assertEqual(result, expected_instance_label) self.assertEqual(result, expected_instance_label)
@@ -525,7 +576,7 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_cpu_avg_agg(self): def test_build_prometheus_query_node_cpu_avg_agg(self):
expected_query = ( expected_query = (
"100 - (avg by (instance)(rate(node_cpu_seconds_total" "100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='a_host'}[111s])) * 100)") "{mode='idle',fqdn='a_host'}[111s])) * 100)")
result = self.helper._build_prometheus_query( result = self.helper._build_prometheus_query(
'avg', 'node_cpu_seconds_total', 'a_host', '111') 'avg', 'node_cpu_seconds_total', 'a_host', '111')
self.assertEqual(result, expected_query) self.assertEqual(result, expected_query)
@@ -533,15 +584,15 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_cpu_max_agg(self): def test_build_prometheus_query_node_cpu_max_agg(self):
expected_query = ( expected_query = (
"100 - (max by (instance)(rate(node_cpu_seconds_total" "100 - (max by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',instance='b_host'}[444s])) * 100)") "{mode='idle',fqdn='b_host'}[444s])) * 100)")
result = self.helper._build_prometheus_query( result = self.helper._build_prometheus_query(
'max', 'node_cpu_seconds_total', 'b_host', '444') 'max', 'node_cpu_seconds_total', 'b_host', '444')
self.assertEqual(result, expected_query) self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_memory_avg_agg(self): def test_build_prometheus_query_node_memory_avg_agg(self):
expected_query = ( expected_query = (
"(node_memory_MemTotal_bytes{instance='c_host'} - avg_over_time" "(node_memory_MemTotal_bytes{fqdn='c_host'} - avg_over_time"
"(node_memory_MemAvailable_bytes{instance='c_host'}[555s])) " "(node_memory_MemAvailable_bytes{fqdn='c_host'}[555s])) "
"/ 1024 / 1024") "/ 1024 / 1024")
result = self.helper._build_prometheus_query( result = self.helper._build_prometheus_query(
'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555') 'avg', 'node_memory_MemAvailable_bytes', 'c_host', '555')
@@ -549,8 +600,27 @@ class TestPrometheusHelper(base.BaseTestCase):
def test_build_prometheus_query_node_memory_min_agg(self): def test_build_prometheus_query_node_memory_min_agg(self):
expected_query = ( expected_query = (
"(node_memory_MemTotal_bytes{instance='d_host'} - min_over_time" "(node_memory_MemTotal_bytes{fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{instance='d_host'}[222s])) " "(node_memory_MemAvailable_bytes{fqdn='d_host'}[222s])) "
"/ 1024 / 1024")
result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')
self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_cpu_avg_agg_custom_label(self):
self.helper.prometheus_fqdn_label = 'custom_fqdn_label'
expected_query = (
"100 - (avg by (instance)(rate(node_cpu_seconds_total"
"{mode='idle',custom_fqdn_label='a_host'}[111s])) * 100)")
result = self.helper._build_prometheus_query(
'avg', 'node_cpu_seconds_total', 'a_host', '111')
self.assertEqual(result, expected_query)
def test_build_prometheus_query_node_memory_min_agg_custom_label(self):
self.helper.prometheus_fqdn_label = 'custom_fqdn'
expected_query = (
"(node_memory_MemTotal_bytes{custom_fqdn='d_host'} - min_over_time"
"(node_memory_MemAvailable_bytes{custom_fqdn='d_host'}[222s])) "
"/ 1024 / 1024") "/ 1024 / 1024")
result = self.helper._build_prometheus_query( result = self.helper._build_prometheus_query(
'min', 'node_memory_MemAvailable_bytes', 'd_host', '222') 'min', 'node_memory_MemAvailable_bytes', 'd_host', '222')