Add metrics collection scheduler with automatic job management, improve terminal session cleanup on 404/403 errors, and refactor metrics collection endpoint to use shared background job function with WebSocket broadcast support
Some checks failed
Tests / Backend Tests (Python) (3.10) (push) Has been cancelled
Tests / Backend Tests (Python) (3.11) (push) Has been cancelled
Tests / Backend Tests (Python) (3.12) (push) Has been cancelled
Tests / Frontend Tests (JS) (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / All Tests Passed (push) Has been cancelled

Bruno Charest 2025-12-21 20:51:18 -05:00
parent 70c15c9b6f
commit 6c51fb5c75
8 changed files with 269 additions and 89 deletions

View File

@ -171,6 +171,10 @@ def create_app() -> FastAPI:
        # Start the scheduler
        await scheduler_service.start_async()
        # Load and apply the metrics collection schedule
        from app.services.metrics_collection_scheduler import load_and_apply_from_db
        await load_and_apply_from_db()
        # Add the Docker jobs to the scheduler
        from app.services.docker_service import docker_service
        from app.services.docker_alerts import docker_alerts_service
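
For orientation, a minimal sketch of where this wiring typically lives — a FastAPI lifespan handler. Only `scheduler_service.start_async()`, `load_and_apply_from_db`, and the import paths come from the hunk; the surrounding structure and the shutdown call are assumptions:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.services import scheduler_service


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start the shared APScheduler instance, then restore the persisted schedule
    await scheduler_service.start_async()
    from app.services.metrics_collection_scheduler import load_and_apply_from_db
    await load_and_apply_from_db()
    yield
    scheduler_service.scheduler.shutdown()  # assumed shutdown hook, not shown in the diff


app = FastAPI(lifespan=lifespan)
```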

View File

@ -10990,7 +10990,11 @@ class DashboardManager {
            });
        } catch (e) {
            console.warn('Terminal heartbeat failed:', e);
            // Session is gone on the server (404/403): stop the heartbeat and clear the session
            if (e && (e.status === 404 || e.status === 403)) {
                this.stopTerminalHeartbeat();
                this.terminalSession = null;
                return;
            }
        }
    }, this.terminalHeartbeatIntervalMs);

View File

@ -76,6 +76,107 @@ def _get_service() -> BuiltinPlaybookService:
    return init_builtin_playbook_service(settings.ansible_dir, ansible_service)

async def collect_all_metrics_job() -> None:
    logger.info("[METRICS_JOB] Starting scheduled metrics collection")
    service = _get_service()
    results = []
    async with async_session_maker() as session:
        host_repo_bg = HostRepository(session)
        metrics_repo = HostMetricsRepository(session)
        try:
            hosts_bg = await host_repo_bg.list(limit=1000)
        except OperationalError:
            return
        if not hosts_bg:
            await _sync_hosts_from_inventory(session)
            hosts_bg = await host_repo_bg.list(limit=1000)
        inventory_hosts = ansible_service.get_hosts_from_inventory()
        inventory_names = {h.name for h in (inventory_hosts or []) if getattr(h, "name", None)}
        skipped = 0
        if inventory_names:
            filtered = [h for h in hosts_bg if h.name in inventory_names]
            skipped = len(hosts_bg) - len(filtered)
            hosts_bg = filtered
        for host in hosts_bg:
            try:
                result = await service.execute_builtin("collect_system_info", host.name)
                ok = bool(result.get("success", False))
                stderr = (result.get("stderr") or "").strip()
                stdout = (result.get("stdout") or "").strip()
                return_code = result.get("return_code")
                error_detail = None
                if not ok:
                    if stderr:
                        error_detail = stderr[:400]
                    elif stdout:
                        error_detail = stdout[:400]
                    else:
                        error_detail = result.get("error") or "Unknown error"
                    logger.warning(
                        "[METRICS] Collection failed for host=%s return_code=%s error=%s",
                        host.name,
                        return_code,
                        error_detail,
                    )
                if ok:
                    for hostname, metrics_data in result.get("parsed_metrics", {}).items():
                        target_host = await host_repo_bg.get_by_name(hostname)
                        if not target_host:
                            continue
                        metrics_create = service.create_metrics_from_parsed(
                            host_id=target_host.id,
                            parsed_data=metrics_data,
                            builtin_id="collect_system_info",
                            execution_time_ms=result.get("execution_time_ms", 0),
                        )
                        await metrics_repo.create(**metrics_create.model_dump())
                    await session.commit()
                results.append({
                    "host": host.name,
                    "success": ok,
                    "return_code": return_code,
                    "error": error_detail,
                })
            except Exception as e:
                await session.rollback()
                logger.exception("[METRICS] Unhandled exception during collection for host=%s", host.name)
                results.append({
                    "host": host.name,
                    "success": False,
                    "error": str(e),
                })
    logger.info(
        "[METRICS_JOB] Collection complete: total=%d success=%d failed=%d skipped=%d",
        len(results),
        sum(1 for r in results if r.get("success")),
        sum(1 for r in results if not r.get("success")),
        skipped,
    )
    await ws_manager.broadcast({
        "type": "metrics_collection_complete",
        "data": {
            "total": len(results),
            "success": sum(1 for r in results if r.get("success")),
            "failed": sum(1 for r in results if not r.get("success")),
            "skipped": skipped,
            "results": results,
        },
    })

@router.get("")
async def list_builtin_playbooks(api_key_valid: bool = Depends(verify_api_key)):
    """List all available builtin playbooks."""
@ -194,94 +295,8 @@ async def collect_all_metrics(
    if not hosts:
        return {"success": True, "message": "Aucun hôte trouvé", "hosts_count": 0}

    async def collect_for_all():
        service = _get_service()
        results = []
        async with async_session_maker() as session:
            host_repo_bg = HostRepository(session)
            metrics_repo = HostMetricsRepository(session)
            hosts_bg = await host_repo_bg.list(limit=1000)
            inventory_hosts = ansible_service.get_hosts_from_inventory()
            inventory_names = {h.name for h in (inventory_hosts or []) if getattr(h, "name", None)}
            skipped = 0
            if inventory_names:
                filtered = [h for h in hosts_bg if h.name in inventory_names]
                skipped = len(hosts_bg) - len(filtered)
                hosts_bg = filtered
            for host in hosts_bg:
                try:
                    result = await service.execute_builtin("collect_system_info", host.name)
                    ok = bool(result.get("success", False))
                    stderr = (result.get("stderr") or "").strip()
                    stdout = (result.get("stdout") or "").strip()
                    return_code = result.get("return_code")
                    error_detail = None
                    if not ok:
                        # Keep payload small but actionable
                        if stderr:
                            error_detail = stderr[:400]
                        elif stdout:
                            error_detail = stdout[:400]
                        else:
                            error_detail = result.get("error") or "Unknown error"
                        logger.warning(
                            "[METRICS] Collection failed for host=%s return_code=%s error=%s",
                            host.name,
                            return_code,
                            error_detail,
                        )
                    if ok:
                        for hostname, metrics_data in result.get("parsed_metrics", {}).items():
                            target_host = await host_repo_bg.get_by_name(hostname)
                            if not target_host:
                                continue
                            metrics_create = service.create_metrics_from_parsed(
                                host_id=target_host.id,
                                parsed_data=metrics_data,
                                builtin_id="collect_system_info",
                                execution_time_ms=result.get("execution_time_ms", 0),
                            )
                            await metrics_repo.create(**metrics_create.model_dump())
                        await session.commit()
                    results.append({
                        "host": host.name,
                        "success": ok,
                        "return_code": return_code,
                        "error": error_detail,
                    })
                except Exception as e:
                    await session.rollback()
                    logger.exception("[METRICS] Unhandled exception during collection for host=%s", host.name)
                    results.append({
                        "host": host.name,
                        "success": False,
                        "error": str(e),
                    })
        await ws_manager.broadcast({
            "type": "metrics_collection_complete",
            "data": {
                "total": len(results),
                "success": sum(1 for r in results if r.get("success")),
                "failed": sum(1 for r in results if not r.get("success")),
                "skipped": skipped,
                "results": results,
            },
        })

    # Run the collection in the background on the active asyncio loop (returns immediately)
    asyncio.create_task(collect_for_all())
    asyncio.create_task(collect_all_metrics_job())
    return {
        "success": True,
View File

@ -17,6 +17,7 @@ from app.core.config import settings
from app.crud.host_metrics import HostMetricsRepository
from app.crud.app_setting import AppSettingRepository
from app.services import db
from app.services.metrics_collection_scheduler import apply_interval_using_global_scheduler

class CollectionScheduleRequest(BaseModel):

@ -123,6 +124,8 @@ async def set_collection_schedule(
        repo = AppSettingRepository(db_session)
        await repo.set("metrics_collection_interval", interval)
        await db_session.commit()
        apply_interval_using_global_scheduler(interval)
        return {"interval": interval, "message": f"Intervalle de collecte défini à {interval}"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
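
End to end, changing the schedule is now a single call that both persists the interval and reschedules the job; a client-side sketch, where the host, port, and bearer-token auth scheme are assumptions:

```python
import httpx

resp = httpx.post(
    "http://localhost:8000/api/metrics/collection-schedule",  # host/port assumed
    json={"interval": "30min"},
    headers={"Authorization": "Bearer <api-key>"},  # auth scheme assumed
)
resp.raise_for_status()
print(resp.json())  # {"interval": "30min", "message": "..."}
```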

View File

@ -1523,11 +1523,15 @@ function executeHistoryCommand(index) {
" try {\n"
" const headers = { 'Content-Type': 'application/json' };\n"
" headers['Authorization'] = 'Bearer ' + TOKEN;\n"
" await fetch('/api/terminal/sessions/' + SESSION_ID + '/heartbeat', {\n"
" const resp = await fetch('/api/terminal/sessions/' + SESSION_ID + '/heartbeat', {\n"
" method: 'POST',\n"
" headers,\n"
" body: JSON.stringify({})\n"
" });\n"
" if (resp.status === 404 || resp.status === 403) {\n"
" if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }\n"
" console.warn('Session expired or invalid, heartbeat stopped');\n"
" }\n"
" } catch (e) {}\n"
" }\n\n"
" function startHeartbeat() {\n"

View File

@ -0,0 +1,77 @@
"""Planification de la collecte automatique des métriques."""
from __future__ import annotations
import logging
from typing import Optional
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from app.models.database import async_session_maker
logger = logging.getLogger(__name__)
METRICS_COLLECTION_JOB_ID = "metrics_collect_all"
def interval_to_seconds(interval: Optional[str]) -> Optional[int]:
if not interval or interval == "off":
return None
mapping = {
"5min": 5 * 60,
"15min": 15 * 60,
"30min": 30 * 60,
"1h": 60 * 60,
"6h": 6 * 60 * 60,
"12h": 12 * 60 * 60,
"24h": 24 * 60 * 60,
}
return mapping.get(interval)
def apply_interval(scheduler: AsyncIOScheduler, interval: str) -> None:
seconds = interval_to_seconds(interval)
existing = scheduler.get_job(METRICS_COLLECTION_JOB_ID)
if seconds is None:
if existing:
scheduler.remove_job(METRICS_COLLECTION_JOB_ID)
logger.info("[METRICS_SCHEDULER] Job removed (interval=off)")
return
from app.routes.builtin_playbooks import collect_all_metrics_job
scheduler.add_job(
collect_all_metrics_job,
trigger="interval",
seconds=seconds,
id=METRICS_COLLECTION_JOB_ID,
name="Metrics Collection",
replace_existing=True,
)
logger.info(f"[METRICS_SCHEDULER] Job scheduled: interval={interval} ({seconds}s)")
def apply_interval_using_global_scheduler(interval: str) -> None:
from app.services import scheduler_service
apply_interval(scheduler_service.scheduler, interval)
async def load_and_apply_from_db() -> str:
from app.crud.app_setting import AppSettingRepository
from app.services import scheduler_service
interval = "off"
async with async_session_maker() as session:
repo = AppSettingRepository(session)
row = await repo.get("metrics_collection_interval")
if row and row.value:
interval = row.value
logger.info(f"[METRICS_SCHEDULER] Loading from DB: interval={interval}")
apply_interval(scheduler_service.scheduler, interval)
return interval
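
The interval vocabulary is deliberately closed: anything outside the mapping (including "off") yields `None` and removes the job. A quick sketch of that contract — APScheduler 3.x accepts add/remove on a scheduler that has not been started yet:

```python
from apscheduler.schedulers.asyncio import AsyncIOScheduler

from app.services.metrics_collection_scheduler import (
    METRICS_COLLECTION_JOB_ID,
    apply_interval,
    interval_to_seconds,
)

assert interval_to_seconds("5min") == 300
assert interval_to_seconds("off") is None
assert interval_to_seconds("weekly") is None  # unknown values disable collection

scheduler = AsyncIOScheduler()
apply_interval(scheduler, "1h")
assert scheduler.get_job(METRICS_COLLECTION_JOB_ID) is not None
apply_interval(scheduler, "off")
assert scheduler.get_job(METRICS_COLLECTION_JOB_ID) is None
```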

View File

@ -0,0 +1,58 @@
# ✅ Health check

## Information

| Property | Value |
|-----------|--------|
| **ID** | `fee8d10e08d4469b82f30d9dbcc3be81` |
| **Name** | Health check |
| **Target** | `ali2v.xeon.home` |
| **Status** | completed |
| **Type** | Manual |
| **Progress** | 100% |
| **Start** | 2025-12-22T01:40:53.655836+00:00 |
| **End** | 2025-12-22T01:41:00.681248+00:00 |
| **Duration** | 7.0s |

## Output

```
Using /mnt/c/dev/git/python/homelab-automation-api-v2/ansible/ansible.cfg as config file
PLAY [Health check on target host] *********************************************
TASK [Check if host is reachable (ping)] ***************************************
ok: [ali2v.xeon.home] => {"changed": false, "ping": "pong"}
TASK [Gather minimal facts] ****************************************************
ok: [ali2v.xeon.home]
TASK [Get system uptime] *******************************************************
ok: [ali2v.xeon.home] => {"changed": false, "cmd": ["uptime"], "delta": "0:00:00.002844", "end": "2025-12-21 20:40:58.450286", "msg": "", "rc": 0, "start": "2025-12-21 20:40:58.447442", "stderr": "", "stderr_lines": [], "stdout": " 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36", "stdout_lines": [" 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36"]}
TASK [Get disk usage] **********************************************************
ok: [ali2v.xeon.home] => {"changed": false, "cmd": "df -h / | tail -1 | awk '{print $5}'", "delta": "0:00:00.004091", "end": "2025-12-21 20:40:58.905596", "msg": "", "rc": 0, "start": "2025-12-21 20:40:58.901505", "stderr": "", "stderr_lines": [], "stdout": "22%", "stdout_lines": ["22%"]}
TASK [Get memory usage (Linux)] ************************************************
ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if command -v free >/dev/null 2>&1; then\n free -m | grep Mem | awk '{printf \"%.1f%%\", $3/$2 * 100}'\nelse\n # Fallback for systems without free command\n cat /proc/meminfo | awk '/MemTotal/{total=$2} /MemAvailable/{avail=$2} END{printf \"%.1f%%\", (total-avail)/total*100}'\nfi\n", "delta": "0:00:00.004129", "end": "2025-12-21 20:40:59.332276", "msg": "", "rc": 0, "start": "2025-12-21 20:40:59.328147", "stderr": "", "stderr_lines": [], "stdout": "20.5%", "stdout_lines": ["20.5%"]}
TASK [Get CPU temperature (ARM/SBC)] *******************************************
ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if [ -f /sys/class/thermal/thermal_zone0/temp ]; then\n temp=$(cat /sys/class/thermal/thermal_zone0/temp)\n # Use awk instead of bc for better compatibility\n echo \"${temp}\" | awk '{printf \"%.1f°C\", $1/1000}'\nelse\n echo \"N/A\"\nfi\n", "delta": "0:00:00.004757", "end": "2025-12-21 20:40:59.742177", "msg": "", "rc": 0, "start": "2025-12-21 20:40:59.737420", "stderr": "", "stderr_lines": [], "stdout": "47.0°C", "stdout_lines": ["47.0°C"]}
TASK [Get CPU load] ************************************************************
ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if [ -f /proc/loadavg ]; then\n cat /proc/loadavg | awk '{print $1}'\nelse\n uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | tr -d ' '\nfi\n", "delta": "0:00:00.004205", "end": "2025-12-21 20:41:00.154244", "msg": "", "rc": 0, "start": "2025-12-21 20:41:00.150039", "stderr": "", "stderr_lines": [], "stdout": "0.42", "stdout_lines": ["0.42"]}
TASK [Display health status] ***************************************************
ok: [ali2v.xeon.home] => {
"msg": "═══════════════════════════════════════\nHost: ali2v.xeon.home\nStatus: OK\n═══════════════════════════════════════\nUptime: 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36\nDisk Usage: 22%\nMemory Usage: 20.5%\nCPU Load: 0.42\nCPU Temp: 47.0°C\n═══════════════════════════════════════\n"
}
PLAY RECAP *********************************************************************
ali2v.xeon.home : ok=8 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
```
---

*Automatically generated by Homelab Automation Dashboard*
*Date: 2025-12-22T01:41:00.705253+00:00*

View File

@ -105,6 +105,21 @@ class TestSetCollectionSchedule:
        data = response.json()
        assert data["interval"] == "30min"

    @pytest.mark.asyncio
    async def test_set_schedule_triggers_reschedule(self, client: AsyncClient):
        """Changing the interval must reschedule the collection job."""
        with patch(
            "app.routes.metrics.apply_interval_using_global_scheduler",
            autospec=True,
        ) as mock_apply:
            response = await client.post(
                "/api/metrics/collection-schedule",
                json={"interval": "5min"}
            )
            assert response.status_code == 200
            mock_apply.assert_called_once_with("5min")

    @pytest.mark.asyncio
    async def test_set_schedule_off(self, client: AsyncClient):
        """Disabling collection."""