- Introduced a new API endpoint `/api/source-status/` that returns the status of the Prometheus and OpenStack data sources.
- Implemented lightweight health-check functions for both Prometheus and OpenStack.
- Updated the dashboard template to display the status of the data sources dynamically.
- Added tests for the new API endpoint to verify its functionality and response handling.
- Configured a cache timeout for source-status checks to improve performance.
287 lines
12 KiB
Python
287 lines
12 KiB
Python
import json
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
from django.conf import settings
|
|
from django.core.cache import cache
|
|
from django.http import JsonResponse
|
|
from django.shortcuts import render
|
|
from dashboard.openstack_utils.connect import check_openstack, get_connection
|
|
from dashboard.openstack_utils.flavor import get_flavor_list
|
|
from dashboard.prometheus_utils.query import check_prometheus, query_prometheus
|
|
from dashboard.openstack_utils.audits import get_audits, get_current_cluster_cpu
|
|
from dashboard.mock_data import get_mock_context
|
|
|
|
# PromQL expressions evaluated concurrently by _fetch_prometheus_metrics().
# Each key is the metric name used by the dashboard; each value is the query.
_PROMETHEUS_QUERIES = {
    # Compute hosts and physical CPU
    "hosts_total": "count(node_exporter_build_info{job='node_exporter_compute'})",
    "pcpu_total": "sum(count(node_cpu_seconds_total{job='node_exporter_compute', mode='idle'}) without (cpu,mode))",
    "pcpu_usage": "sum(node_load5{job='node_exporter_compute'})",
    # Virtual CPU
    "vcpu_allocated": "sum(libvirt_domain_info_virtual_cpus)",
    "vcpu_overcommit_max": "avg(openstack_placement_resource_allocation_ratio{resourcetype='VCPU'})",
    # Physical RAM
    "pram_total": "sum(node_memory_MemTotal_bytes{job='node_exporter_compute'})",
    "pram_usage": "sum(node_memory_Active_bytes{job='node_exporter_compute'})",
    # Virtual RAM
    "vram_allocated": "sum(libvirt_domain_info_maximum_memory_bytes)",
    "vram_overcommit_max": "avg(avg_over_time(openstack_placement_resource_allocation_ratio{resourcetype='MEMORY_MB'}[5m]))",
    # Virtual machines
    # NOTE(review): sum() only equals a VM count if every series has value 1;
    # if the exporter reports the numeric state code, count() may be intended
    # for "vm_count" -- confirm against the exporter's metric definition.
    "vm_count": "sum(libvirt_domain_state_code)",
    "vm_active": "sum(libvirt_domain_state_code{stateDesc='the domain is running'})",
}
|
|
|
|
|
|
# Metrics that are parsed as floats; every other metric is parsed as an int.
_FLOAT_METRIC_KEYS = frozenset(
    ("pcpu_usage", "vcpu_overcommit_max", "vram_overcommit_max")
)


def _coerce_metric(raw, as_float):
    """Convert one raw query result to ``float`` or ``int``."""
    if as_float:
        return float(raw)
    try:
        return int(raw)
    except ValueError:
        # Accept float-formatted samples such as "12.0" for integer metrics
        # instead of discarding them.
        return int(float(raw))


def _fetch_prometheus_metrics(queries=None, fetch=None):
    """Run Prometheus queries in parallel and return a dict of name -> value.

    Args:
        queries: Optional mapping of metric name -> query string.  Defaults
            to the module-level ``_PROMETHEUS_QUERIES``.
        fetch: Optional callable invoked as ``fetch(query=<query string>)``
            that returns the raw sample.  Defaults to ``query_prometheus``.

    Returns:
        dict mapping each metric name to a number.  Names listed in
        ``_FLOAT_METRIC_KEYS`` are floats, all others are ints.  A sample that
        cannot be converted (``ValueError``, ``TypeError`` or
        ``OverflowError`` while fetching or parsing) is reported as ``0.0`` /
        ``0`` respectively.  An empty ``queries`` mapping yields ``{}``.

    Raises:
        Any other exception raised by ``fetch`` propagates to the caller.
    """
    if queries is None:
        queries = _PROMETHEUS_QUERIES
    if fetch is None:
        fetch = query_prometheus
    if not queries:
        # ThreadPoolExecutor rejects max_workers=0, so short-circuit here.
        return {}

    result = {}
    with ThreadPoolExecutor(max_workers=len(queries)) as executor:
        future_to_key = {
            executor.submit(fetch, query=q): key for key, q in queries.items()
        }
        for future in as_completed(future_to_key):
            key = future_to_key[future]
            is_float = key in _FLOAT_METRIC_KEYS
            try:
                # future.result() stays inside the try so conversion-type
                # errors raised by the query itself also fall back to zero.
                result[key] = _coerce_metric(future.result(), is_float)
            except (ValueError, TypeError, OverflowError):
                result[key] = 0.0 if is_float else 0
    return result
|
|
|
|
|
|
def collect_context():
    """Build the complete dashboard context in a single pass.

    One OpenStack connection is opened and reused for the flavor list, the
    audits and the current-cluster CPU data; capacity figures come from
    ``_fetch_prometheus_metrics()``.  The ``current_cluster`` values and the
    ``migrations`` / ``host_labels`` / ``cpu_current`` / ``cpu_projected``
    fields of every audit are JSON-encoded before the context is returned.

    Returns:
        dict with the keys ``region``, ``pcpu``, ``vcpu``, ``pram``, ``vram``,
        ``vm``, ``flavors``, ``audits`` and ``current_cluster``.
    """

    def percent(part, whole):
        # Share of `whole` taken by `part`, in percent; 0 when `whole` is falsy.
        return (part / whole * 100) if whole else 0

    def per_unit(amount, units):
        # Plain ratio with the same guard against a falsy divisor.
        return (amount / units) if units else 0

    conn = get_connection()
    region = conn._compute_region
    flavor_info = get_flavor_list(connection=conn)
    audit_list = get_audits(connection=conn)

    samples = _fetch_prometheus_metrics()
    # A missing or zero host count is reported as 1.
    host_count = samples.get("hosts_total") or 1
    cpu_physical = samples.get("pcpu_total", 0)
    cpu_load = samples.get("pcpu_usage", 0)
    cpu_allocated = samples.get("vcpu_allocated", 0)
    cpu_ratio_max = samples.get("vcpu_overcommit_max", 0)
    ram_physical = samples.get("pram_total", 0)
    ram_active = samples.get("pram_usage", 0)
    ram_allocated = samples.get("vram_allocated", 0)
    ram_ratio_max = samples.get("vram_overcommit_max", 0)
    vms_total = samples.get("vm_count", 0)
    vms_running = samples.get("vm_active", 0)

    # Virtual capacity is the physical capacity scaled by the allocation ratio.
    cpu_virtual = cpu_physical * cpu_ratio_max
    ram_virtual = ram_physical * ram_ratio_max

    context = {
        # Region
        "region": {"name": region, "hosts_total": host_count},
        # Physical CPU
        "pcpu": {
            "total": cpu_physical,
            "usage": cpu_load,
            "free": cpu_physical - cpu_load,
            "used_percentage": percent(cpu_load, cpu_physical),
        },
        # Virtual CPU
        "vcpu": {
            "total": cpu_virtual,
            "allocated": cpu_allocated,
            "free": cpu_virtual - cpu_allocated,
            "allocated_percentage": percent(cpu_allocated, cpu_virtual),
            "overcommit_ratio": per_unit(cpu_allocated, cpu_physical),
            "overcommit_max": cpu_ratio_max,
        },
        # Physical RAM
        "pram": {
            "total": ram_physical,
            "usage": ram_active,
            "free": ram_physical - ram_active,
            "used_percentage": percent(ram_active, ram_physical),
        },
        # Virtual RAM
        "vram": {
            "total": ram_virtual,
            "allocated": ram_allocated,
            "free": ram_virtual - ram_allocated,
            "allocated_percentage": percent(ram_allocated, ram_virtual),
            "overcommit_ratio": per_unit(ram_allocated, ram_physical),
            "overcommit_max": ram_ratio_max,
        },
        # Virtual machines
        "vm": {
            "count": vms_total,
            "active": vms_running,
            "stopped": vms_total - vms_running,
            "avg_cpu": per_unit(cpu_allocated, vms_total),
            "avg_ram": per_unit(ram_allocated, vms_total),
            "density": per_unit(vms_total, host_count),
        },
        "flavors": flavor_info,
        "audits": audit_list,
    }

    cluster = get_current_cluster_cpu(conn)
    context["current_cluster"] = {
        field: json.dumps(cluster[field])
        for field in ("host_labels", "cpu_current")
    }

    # JSON-encode the audit fields in place so a cached context is
    # render-ready for the JavaScript side.
    for entry in context["audits"]:
        for field in ("migrations", "host_labels", "cpu_current", "cpu_projected"):
            entry[field] = json.dumps(entry[field])
    return context
|
|
|
|
|
|
def collect_stats():
    """Build the stats payload: region, pcpu, vcpu, pram, vram, vm and flavors.

    Audits are not included.  The flavor list is read through a fresh
    OpenStack connection and the capacity figures come from
    ``_fetch_prometheus_metrics()``.

    Returns:
        dict with the keys ``region``, ``pcpu``, ``vcpu``, ``pram``, ``vram``,
        ``vm`` and ``flavors``.
    """

    def percent(part, whole):
        # Share of `whole` taken by `part`, in percent; 0 when `whole` is falsy.
        return (part / whole * 100) if whole else 0

    def per_unit(amount, units):
        # Plain ratio with the same guard against a falsy divisor.
        return (amount / units) if units else 0

    conn = get_connection()
    region = conn._compute_region
    flavor_info = get_flavor_list(connection=conn)

    samples = _fetch_prometheus_metrics()
    # A missing or zero host count is reported as 1.
    host_count = samples.get("hosts_total") or 1
    cpu_physical = samples.get("pcpu_total", 0)
    cpu_load = samples.get("pcpu_usage", 0)
    cpu_allocated = samples.get("vcpu_allocated", 0)
    cpu_ratio_max = samples.get("vcpu_overcommit_max", 0)
    ram_physical = samples.get("pram_total", 0)
    ram_active = samples.get("pram_usage", 0)
    ram_allocated = samples.get("vram_allocated", 0)
    ram_ratio_max = samples.get("vram_overcommit_max", 0)
    vms_total = samples.get("vm_count", 0)
    vms_running = samples.get("vm_active", 0)

    # Virtual capacity is the physical capacity scaled by the allocation ratio.
    cpu_virtual = cpu_physical * cpu_ratio_max
    ram_virtual = ram_physical * ram_ratio_max

    region_section = {"name": region, "hosts_total": host_count}
    pcpu_section = {
        "total": cpu_physical,
        "usage": cpu_load,
        "free": cpu_physical - cpu_load,
        "used_percentage": percent(cpu_load, cpu_physical),
    }
    vcpu_section = {
        "total": cpu_virtual,
        "allocated": cpu_allocated,
        "free": cpu_virtual - cpu_allocated,
        "allocated_percentage": percent(cpu_allocated, cpu_virtual),
        "overcommit_ratio": per_unit(cpu_allocated, cpu_physical),
        "overcommit_max": cpu_ratio_max,
    }
    pram_section = {
        "total": ram_physical,
        "usage": ram_active,
        "free": ram_physical - ram_active,
        "used_percentage": percent(ram_active, ram_physical),
    }
    vram_section = {
        "total": ram_virtual,
        "allocated": ram_allocated,
        "free": ram_virtual - ram_allocated,
        "allocated_percentage": percent(ram_allocated, ram_virtual),
        "overcommit_ratio": per_unit(ram_allocated, ram_physical),
        "overcommit_max": ram_ratio_max,
    }
    vm_section = {
        "count": vms_total,
        "active": vms_running,
        "stopped": vms_total - vms_running,
        "avg_cpu": per_unit(cpu_allocated, vms_total),
        "avg_ram": per_unit(ram_allocated, vms_total),
        "density": per_unit(vms_total, host_count),
    }
    return {
        "region": region_section,
        "pcpu": pcpu_section,
        "vcpu": vcpu_section,
        "pram": pram_section,
        "vram": vram_section,
        "vm": vm_section,
        "flavors": flavor_info,
    }
|
|
|
|
|
|
def collect_audits():
    """Fetch the audits and JSON-encode their frontend fields in place.

    Returns:
        The object returned by ``get_audits()``, with the ``migrations``,
        ``host_labels``, ``cpu_current`` and ``cpu_projected`` entry of every
        audit replaced by its ``json.dumps`` string.
    """
    encoded_fields = ("migrations", "host_labels", "cpu_current", "cpu_projected")
    audit_list = get_audits(connection=get_connection())
    for entry in audit_list:
        for field in encoded_fields:
            entry[field] = json.dumps(entry[field])
    return audit_list
|
|
|
|
|
|
def _skeleton_context():
    """Return a placeholder context for the skeleton-only index render.

    Every numeric field is 0, names are an em dash, ``audits`` is empty and
    the ``current_cluster`` values are the JSON string ``"[]"``.  A fresh set
    of dicts is built on every call.
    """

    def zeroed(*fields):
        # New dict mapping every given field name to 0.
        return dict.fromkeys(fields, 0)

    placeholder = "—"
    physical_fields = ("total", "usage", "free", "used_percentage")
    virtual_fields = (
        "total",
        "allocated",
        "free",
        "allocated_percentage",
        "overcommit_ratio",
        "overcommit_max",
    )
    return {
        "skeleton": True,
        "region": {"name": placeholder, "hosts_total": 0},
        "pcpu": zeroed(*physical_fields),
        "pram": zeroed(*physical_fields),
        "vcpu": zeroed(*virtual_fields),
        "vram": zeroed(*virtual_fields),
        "vm": zeroed("count", "active", "stopped", "avg_cpu", "avg_ram", "density"),
        "flavors": {
            "first_common_flavor": {"name": placeholder, "count": 0},
            "second_common_flavor": None,
            "third_common_flavor": None,
        },
        "audits": [],
        "current_cluster": {"host_labels": "[]", "cpu_current": "[]"},
    }
|
|
|
|
|
|
def index(request):
    """Render ``index.html`` with mock data or with the empty skeleton context.

    When ``settings.USE_MOCK_DATA`` is truthy the context comes from
    ``get_mock_context()``; otherwise the page is rendered with
    ``_skeleton_context()``.
    """
    use_mock = getattr(settings, "USE_MOCK_DATA", False)
    page_context = get_mock_context() if use_mock else _skeleton_context()
    return render(request, "index.html", page_context)
|
|
|
|
|
|
def api_stats(request):
    """Return the stats payload as JSON, served from the cache when present.

    On a cache miss the payload is built with ``collect_stats()`` and stored
    under ``"dashboard_stats"`` for ``settings.DASHBOARD_CACHE_TTL`` seconds
    (120 when the setting is absent).
    """
    key = "dashboard_stats"
    ttl = getattr(settings, "DASHBOARD_CACHE_TTL", 120)
    payload = cache.get(key)
    if payload is not None:
        return JsonResponse(payload)
    payload = collect_stats()
    cache.set(key, payload, timeout=ttl)
    return JsonResponse(payload)
|
|
|
|
|
|
def api_audits(request):
    """Return the audits and the current-cluster CPU data as JSON.

    The two parts are cached independently (``"dashboard_audits"`` and
    ``"dashboard_current_cluster"``) for ``settings.DASHBOARD_CACHE_TTL``
    seconds (120 when the setting is absent); only the part that is missing
    from the cache is recomputed.
    """
    audits_key, cluster_key = "dashboard_audits", "dashboard_current_cluster"
    ttl = getattr(settings, "DASHBOARD_CACHE_TTL", 120)

    # Look up both entries before refreshing either of them.
    audit_payload = cache.get(audits_key)
    cluster_payload = cache.get(cluster_key)

    if audit_payload is None:
        audit_payload = collect_audits()
        cache.set(audits_key, audit_payload, timeout=ttl)

    if cluster_payload is None:
        cluster_payload = get_current_cluster_cpu(get_connection())
        cache.set(cluster_key, cluster_payload, timeout=ttl)

    response_body = {"audits": audit_payload, "current_cluster": cluster_payload}
    return JsonResponse(response_body)
|
|
|
|
|
|
def api_source_status(request):
    """Return the status of the Prometheus and OpenStack data sources as JSON.

    With ``settings.USE_MOCK_DATA`` enabled both sources report
    ``{"status": "mock"}`` and no check is performed.  Otherwise the results
    of ``check_prometheus()`` and ``check_openstack()`` are returned, cached
    under ``"dashboard_source_status"`` for
    ``settings.SOURCE_STATUS_CACHE_TTL`` seconds (30 when the setting is
    absent).
    """
    mock_mode = getattr(settings, "USE_MOCK_DATA", False)
    if mock_mode:
        mock_payload = {
            "prometheus": {"status": "mock"},
            "openstack": {"status": "mock"},
        }
        return JsonResponse(mock_payload)

    key = "dashboard_source_status"
    ttl = getattr(settings, "SOURCE_STATUS_CACHE_TTL", 30)
    status = cache.get(key)
    if status is None:
        # Prometheus is probed first, then OpenStack.
        prometheus_status = check_prometheus()
        openstack_status = check_openstack()
        status = {"prometheus": prometheus_status, "openstack": openstack_status}
        cache.set(key, status, timeout=ttl)
    return JsonResponse(status)