From 6c51fb5c75e17fb6281adba88fb2ba9b9f4dbb41 Mon Sep 17 00:00:00 2001 From: Bruno Charest Date: Sun, 21 Dec 2025 20:51:18 -0500 Subject: [PATCH] Add metrics collection scheduler with automatic job management, improve terminal session cleanup on 404/403 errors, and refactor metrics collection endpoint to use shared background job function with WebSocket broadcast support --- app/factory.py | 4 + app/main.js | 6 +- app/routes/builtin_playbooks.py | 189 ++++++++++-------- app/routes/metrics.py | 3 + app/routes/terminal.py | 6 +- app/services/metrics_collection_scheduler.py | 77 +++++++ ...on.home_Vérification_de_santé_completed.md | 58 ++++++ tests/backend/test_routes_metrics.py | 15 ++ 8 files changed, 269 insertions(+), 89 deletions(-) create mode 100644 app/services/metrics_collection_scheduler.py create mode 100644 logs/tasks_logs/2025/12/21/task_014100_820ec1_ali2v.xeon.home_Vérification_de_santé_completed.md diff --git a/app/factory.py b/app/factory.py index 07b16d3..b78c51c 100644 --- a/app/factory.py +++ b/app/factory.py @@ -171,6 +171,10 @@ def create_app() -> FastAPI: # Démarrer le scheduler await scheduler_service.start_async() + # Charger et appliquer la planification de la collecte des métriques + from app.services.metrics_collection_scheduler import load_and_apply_from_db + await load_and_apply_from_db() + # Ajouter les jobs Docker au scheduler from app.services.docker_service import docker_service from app.services.docker_alerts import docker_alerts_service diff --git a/app/main.js b/app/main.js index b455096..c90bfd5 100644 --- a/app/main.js +++ b/app/main.js @@ -10990,7 +10990,11 @@ class DashboardManager { }); } catch (e) { console.warn('Terminal heartbeat failed:', e); - // Session might be expired - don't stop heartbeat, let server handle it + if (e && (e.status === 404 || e.status === 403)) { + this.stopTerminalHeartbeat(); + this.terminalSession = null; + return; + } } }, this.terminalHeartbeatIntervalMs); diff --git a/app/routes/builtin_playbooks.py b/app/routes/builtin_playbooks.py index 778f41c..3d92072 100644 --- a/app/routes/builtin_playbooks.py +++ b/app/routes/builtin_playbooks.py @@ -76,6 +76,107 @@ def _get_service() -> BuiltinPlaybookService: return init_builtin_playbook_service(settings.ansible_dir, ansible_service) +async def collect_all_metrics_job() -> None: + logger.info("[METRICS_JOB] Starting scheduled metrics collection") + service = _get_service() + results = [] + + async with async_session_maker() as session: + host_repo_bg = HostRepository(session) + metrics_repo = HostMetricsRepository(session) + + try: + hosts_bg = await host_repo_bg.list(limit=1000) + except OperationalError: + return + + if not hosts_bg: + await _sync_hosts_from_inventory(session) + hosts_bg = await host_repo_bg.list(limit=1000) + + inventory_hosts = ansible_service.get_hosts_from_inventory() + inventory_names = {h.name for h in (inventory_hosts or []) if getattr(h, "name", None)} + skipped = 0 + if inventory_names: + filtered = [h for h in hosts_bg if h.name in inventory_names] + skipped = len(hosts_bg) - len(filtered) + hosts_bg = filtered + + for host in hosts_bg: + try: + result = await service.execute_builtin("collect_system_info", host.name) + ok = bool(result.get("success", False)) + + stderr = (result.get("stderr") or "").strip() + stdout = (result.get("stdout") or "").strip() + return_code = result.get("return_code") + + error_detail = None + if not ok: + if stderr: + error_detail = stderr[:400] + elif stdout: + error_detail = stdout[:400] + else: + error_detail = result.get("error") or "Unknown error" + + logger.warning( + "[METRICS] Collection failed for host=%s return_code=%s error=%s", + host.name, + return_code, + error_detail, + ) + + if ok: + for hostname, metrics_data in result.get("parsed_metrics", {}).items(): + target_host = await host_repo_bg.get_by_name(hostname) + if not target_host: + continue + + metrics_create = service.create_metrics_from_parsed( + host_id=target_host.id, + parsed_data=metrics_data, + builtin_id="collect_system_info", + execution_time_ms=result.get("execution_time_ms", 0), + ) + await metrics_repo.create(**metrics_create.model_dump()) + + await session.commit() + + results.append({ + "host": host.name, + "success": ok, + "return_code": return_code, + "error": error_detail, + }) + except Exception as e: + await session.rollback() + logger.exception("[METRICS] Unhandled exception during collection for host=%s", host.name) + results.append({ + "host": host.name, + "success": False, + "error": str(e) + }) + + logger.info( + "[METRICS_JOB] Collection complete: total=%d success=%d failed=%d skipped=%d", + len(results), + sum(1 for r in results if r.get("success")), + sum(1 for r in results if not r.get("success")), + skipped, + ) + await ws_manager.broadcast({ + "type": "metrics_collection_complete", + "data": { + "total": len(results), + "success": sum(1 for r in results if r.get("success")), + "failed": sum(1 for r in results if not r.get("success")), + "skipped": skipped, + "results": results, + } + }) + + @router.get("") async def list_builtin_playbooks(api_key_valid: bool = Depends(verify_api_key)): """Liste tous les builtin playbooks disponibles.""" @@ -194,94 +295,8 @@ async def collect_all_metrics( if not hosts: return {"success": True, "message": "Aucun hôte trouvé", "hosts_count": 0} - async def collect_for_all(): - service = _get_service() - results = [] - - async with async_session_maker() as session: - host_repo_bg = HostRepository(session) - metrics_repo = HostMetricsRepository(session) - - hosts_bg = await host_repo_bg.list(limit=1000) - - inventory_hosts = ansible_service.get_hosts_from_inventory() - inventory_names = {h.name for h in (inventory_hosts or []) if getattr(h, "name", None)} - skipped = 0 - if inventory_names: - filtered = [h for h in hosts_bg if h.name in inventory_names] - skipped = len(hosts_bg) - len(filtered) - hosts_bg = filtered - - for host in hosts_bg: - try: - result = await service.execute_builtin("collect_system_info", host.name) - ok = bool(result.get("success", False)) - - stderr = (result.get("stderr") or "").strip() - stdout = (result.get("stdout") or "").strip() - return_code = result.get("return_code") - - error_detail = None - if not ok: - # Keep payload small but actionable - if stderr: - error_detail = stderr[:400] - elif stdout: - error_detail = stdout[:400] - else: - error_detail = result.get("error") or "Unknown error" - - logger.warning( - "[METRICS] Collection failed for host=%s return_code=%s error=%s", - host.name, - return_code, - error_detail, - ) - - if ok: - for hostname, metrics_data in result.get("parsed_metrics", {}).items(): - target_host = await host_repo_bg.get_by_name(hostname) - if not target_host: - continue - - metrics_create = service.create_metrics_from_parsed( - host_id=target_host.id, - parsed_data=metrics_data, - builtin_id="collect_system_info", - execution_time_ms=result.get("execution_time_ms", 0), - ) - await metrics_repo.create(**metrics_create.model_dump()) - - await session.commit() - - results.append({ - "host": host.name, - "success": ok, - "return_code": return_code, - "error": error_detail, - }) - except Exception as e: - await session.rollback() - logger.exception("[METRICS] Unhandled exception during collection for host=%s", host.name) - results.append({ - "host": host.name, - "success": False, - "error": str(e) - }) - - await ws_manager.broadcast({ - "type": "metrics_collection_complete", - "data": { - "total": len(results), - "success": sum(1 for r in results if r.get("success")), - "failed": sum(1 for r in results if not r.get("success")), - "skipped": skipped, - "results": results, - } - }) - # Lancer en arrière-plan via la boucle asyncio active (retour immédiat) - asyncio.create_task(collect_for_all()) + asyncio.create_task(collect_all_metrics_job()) return { "success": True, diff --git a/app/routes/metrics.py b/app/routes/metrics.py index 475e575..e1f5044 100644 --- a/app/routes/metrics.py +++ b/app/routes/metrics.py @@ -17,6 +17,7 @@ from app.core.config import settings from app.crud.host_metrics import HostMetricsRepository from app.crud.app_setting import AppSettingRepository from app.services import db +from app.services.metrics_collection_scheduler import apply_interval_using_global_scheduler class CollectionScheduleRequest(BaseModel): @@ -123,6 +124,8 @@ async def set_collection_schedule( repo = AppSettingRepository(db_session) await repo.set("metrics_collection_interval", interval) await db_session.commit() + apply_interval_using_global_scheduler(interval) + return {"interval": interval, "message": f"Intervalle de collecte défini à {interval}"} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/routes/terminal.py b/app/routes/terminal.py index 18153d9..473ee68 100644 --- a/app/routes/terminal.py +++ b/app/routes/terminal.py @@ -1523,11 +1523,15 @@ function executeHistoryCommand(index) { " try {\n" " const headers = { 'Content-Type': 'application/json' };\n" " headers['Authorization'] = 'Bearer ' + TOKEN;\n" - " await fetch('/api/terminal/sessions/' + SESSION_ID + '/heartbeat', {\n" + " const resp = await fetch('/api/terminal/sessions/' + SESSION_ID + '/heartbeat', {\n" " method: 'POST',\n" " headers,\n" " body: JSON.stringify({})\n" " });\n" + " if (resp.status === 404 || resp.status === 403) {\n" + " if (heartbeatTimer) { clearInterval(heartbeatTimer); heartbeatTimer = null; }\n" + " console.warn('Session expired or invalid, heartbeat stopped');\n" + " }\n" " } catch (e) {}\n" " }\n\n" " function startHeartbeat() {\n" diff --git a/app/services/metrics_collection_scheduler.py b/app/services/metrics_collection_scheduler.py new file mode 100644 index 0000000..1d921e4 --- /dev/null +++ b/app/services/metrics_collection_scheduler.py @@ -0,0 +1,77 @@ +"""Planification de la collecte automatique des métriques.""" + +from __future__ import annotations + +import logging +from typing import Optional + +from apscheduler.schedulers.asyncio import AsyncIOScheduler + +from app.models.database import async_session_maker + +logger = logging.getLogger(__name__) + + +METRICS_COLLECTION_JOB_ID = "metrics_collect_all" + + +def interval_to_seconds(interval: Optional[str]) -> Optional[int]: + if not interval or interval == "off": + return None + + mapping = { + "5min": 5 * 60, + "15min": 15 * 60, + "30min": 30 * 60, + "1h": 60 * 60, + "6h": 6 * 60 * 60, + "12h": 12 * 60 * 60, + "24h": 24 * 60 * 60, + } + + return mapping.get(interval) + + +def apply_interval(scheduler: AsyncIOScheduler, interval: str) -> None: + seconds = interval_to_seconds(interval) + + existing = scheduler.get_job(METRICS_COLLECTION_JOB_ID) + if seconds is None: + if existing: + scheduler.remove_job(METRICS_COLLECTION_JOB_ID) + logger.info("[METRICS_SCHEDULER] Job removed (interval=off)") + return + + from app.routes.builtin_playbooks import collect_all_metrics_job + + scheduler.add_job( + collect_all_metrics_job, + trigger="interval", + seconds=seconds, + id=METRICS_COLLECTION_JOB_ID, + name="Metrics Collection", + replace_existing=True, + ) + logger.info(f"[METRICS_SCHEDULER] Job scheduled: interval={interval} ({seconds}s)") + + +def apply_interval_using_global_scheduler(interval: str) -> None: + from app.services import scheduler_service + + apply_interval(scheduler_service.scheduler, interval) + + +async def load_and_apply_from_db() -> str: + from app.crud.app_setting import AppSettingRepository + from app.services import scheduler_service + + interval = "off" + async with async_session_maker() as session: + repo = AppSettingRepository(session) + row = await repo.get("metrics_collection_interval") + if row and row.value: + interval = row.value + + logger.info(f"[METRICS_SCHEDULER] Loading from DB: interval={interval}") + apply_interval(scheduler_service.scheduler, interval) + return interval diff --git a/logs/tasks_logs/2025/12/21/task_014100_820ec1_ali2v.xeon.home_Vérification_de_santé_completed.md b/logs/tasks_logs/2025/12/21/task_014100_820ec1_ali2v.xeon.home_Vérification_de_santé_completed.md new file mode 100644 index 0000000..d50dbbd --- /dev/null +++ b/logs/tasks_logs/2025/12/21/task_014100_820ec1_ali2v.xeon.home_Vérification_de_santé_completed.md @@ -0,0 +1,58 @@ +# ✅ Vérification de santé + +## Informations + +| Propriété | Valeur | +|-----------|--------| +| **ID** | `fee8d10e08d4469b82f30d9dbcc3be81` | +| **Nom** | Vérification de santé | +| **Cible** | `ali2v.xeon.home` | +| **Statut** | completed | +| **Type** | Manuel | +| **Progression** | 100% | +| **Début** | 2025-12-22T01:40:53.655836+00:00 | +| **Fin** | 2025-12-22T01:41:00.681248+00:00 | +| **Durée** | 7.0s | + +## Sortie + +``` +Using /mnt/c/dev/git/python/homelab-automation-api-v2/ansible/ansible.cfg as config file + +PLAY [Health check on target host] ********************************************* + +TASK [Check if host is reachable (ping)] *************************************** +ok: [ali2v.xeon.home] => {"changed": false, "ping": "pong"} + +TASK [Gather minimal facts] **************************************************** +ok: [ali2v.xeon.home] + +TASK [Get system uptime] ******************************************************* +ok: [ali2v.xeon.home] => {"changed": false, "cmd": ["uptime"], "delta": "0:00:00.002844", "end": "2025-12-21 20:40:58.450286", "msg": "", "rc": 0, "start": "2025-12-21 20:40:58.447442", "stderr": "", "stderr_lines": [], "stdout": " 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36", "stdout_lines": [" 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36"]} + +TASK [Get disk usage] ********************************************************** +ok: [ali2v.xeon.home] => {"changed": false, "cmd": "df -h / | tail -1 | awk '{print $5}'", "delta": "0:00:00.004091", "end": "2025-12-21 20:40:58.905596", "msg": "", "rc": 0, "start": "2025-12-21 20:40:58.901505", "stderr": "", "stderr_lines": [], "stdout": "22%", "stdout_lines": ["22%"]} + +TASK [Get memory usage (Linux)] ************************************************ +ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if command -v free >/dev/null 2>&1; then\n free -m | grep Mem | awk '{printf \"%.1f%%\", $3/$2 * 100}'\nelse\n # Fallback for systems without free command\n cat /proc/meminfo | awk '/MemTotal/{total=$2} /MemAvailable/{avail=$2} END{printf \"%.1f%%\", (total-avail)/total*100}'\nfi\n", "delta": "0:00:00.004129", "end": "2025-12-21 20:40:59.332276", "msg": "", "rc": 0, "start": "2025-12-21 20:40:59.328147", "stderr": "", "stderr_lines": [], "stdout": "20.5%", "stdout_lines": ["20.5%"]} + +TASK [Get CPU temperature (ARM/SBC)] ******************************************* +ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if [ -f /sys/class/thermal/thermal_zone0/temp ]; then\n temp=$(cat /sys/class/thermal/thermal_zone0/temp)\n # Use awk instead of bc for better compatibility\n echo \"${temp}\" | awk '{printf \"%.1f°C\", $1/1000}'\nelse\n echo \"N/A\"\nfi\n", "delta": "0:00:00.004757", "end": "2025-12-21 20:40:59.742177", "msg": "", "rc": 0, "start": "2025-12-21 20:40:59.737420", "stderr": "", "stderr_lines": [], "stdout": "47.0°C", "stdout_lines": ["47.0°C"]} + +TASK [Get CPU load] ************************************************************ +ok: [ali2v.xeon.home] => {"changed": false, "cmd": "if [ -f /proc/loadavg ]; then\n cat /proc/loadavg | awk '{print $1}'\nelse\n uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | tr -d ' '\nfi\n", "delta": "0:00:00.004205", "end": "2025-12-21 20:41:00.154244", "msg": "", "rc": 0, "start": "2025-12-21 20:41:00.150039", "stderr": "", "stderr_lines": [], "stdout": "0.42", "stdout_lines": ["0.42"]} + +TASK [Display health status] *************************************************** +ok: [ali2v.xeon.home] => { + "msg": "═══════════════════════════════════════\nHost: ali2v.xeon.home\nStatus: OK\n═══════════════════════════════════════\nUptime: 20:40:58 up 1 day, 21:40, 1 user, load average: 0.42, 0.39, 0.36\nDisk Usage: 22%\nMemory Usage: 20.5%\nCPU Load: 0.42\nCPU Temp: 47.0°C\n═══════════════════════════════════════\n" +} + +PLAY RECAP ********************************************************************* +ali2v.xeon.home : ok=8 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +``` + +--- +*Généré automatiquement par Homelab Automation Dashboard* +*Date: 2025-12-22T01:41:00.705253+00:00* diff --git a/tests/backend/test_routes_metrics.py b/tests/backend/test_routes_metrics.py index 9f0275a..552c113 100644 --- a/tests/backend/test_routes_metrics.py +++ b/tests/backend/test_routes_metrics.py @@ -105,6 +105,21 @@ class TestSetCollectionSchedule: data = response.json() assert data["interval"] == "30min" + @pytest.mark.asyncio + async def test_set_schedule_triggers_reschedule(self, client: AsyncClient): + """Le changement d'intervalle doit rescheduler le job de collecte.""" + with patch( + "app.routes.metrics.apply_interval_using_global_scheduler", + autospec=True, + ) as mock_apply: + response = await client.post( + "/api/metrics/collection-schedule", + json={"interval": "5min"} + ) + + assert response.status_code == 200 + mock_apply.assert_called_once_with("5min") + @pytest.mark.asyncio async def test_set_schedule_off(self, client: AsyncClient): """Désactivation de la collecte."""