homelab_automation/app/services/docker_alerts.py
Bruno Charest 68a9b0f390
Some checks failed
Tests / Backend Tests (Python) (3.10) (push) Has been cancelled
Tests / Backend Tests (Python) (3.11) (push) Has been cancelled
Tests / Backend Tests (Python) (3.12) (push) Has been cancelled
Tests / Frontend Tests (JS) (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / All Tests Passed (push) Has been cancelled
Remove Node.js cache files containing npm vulnerability data for vitest and vite packages
2025-12-15 20:36:06 -05:00

386 lines
14 KiB
Python

"""Docker alerts service.
Handles detection of container issues and alert notifications.
"""
import asyncio
import logging
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Optional
from app.models.database import async_session_maker
from app.crud.host import HostRepository
from app.crud.docker_container import DockerContainerRepository
from app.crud.docker_alert import DockerAlertRepository
from app.services.notification_service import notification_service
from app.services.websocket_service import ws_manager
# Named logger for this module; the dotted name makes it a descendant of any
# "homelab" / "homelab.docker" logger configured elsewhere.
logger = logging.getLogger("homelab.docker.alerts")
# Minimum number of seconds between repeat notifications for the same open
# alert (300 s = 5 minutes).
NOTIFICATION_COOLDOWN_SECONDS = 300
class DockerAlertsService:
    """Service for detecting Docker container issues and managing alerts.

    Containers opt in to monitoring with the ``homelab.monitor=true`` label.
    For monitored containers an alert is opened when the container is not
    running although it is expected to be, or when its health check reports
    ``unhealthy``; open alerts are closed again once the container is running
    and not unhealthy. Alert changes are pushed to the notification service
    and broadcast over WebSocket.
    """

    # Notification priority passed to the notification service per severity;
    # unknown severities fall back to _DEFAULT_PRIORITY.
    _PRIORITY_BY_SEVERITY = {
        "warning": 3,
        "error": 4,
        "critical": 5,
    }
    _DEFAULT_PRIORITY = 3

    # Title emoji per severity; anything not listed gets _DEFAULT_EMOJI.
    _EMOJI_BY_SEVERITY = {
        "critical": "🚨",
        "error": "⚠️",
    }
    _DEFAULT_EMOJI = "🔔"

    def __init__(self):
        # UTC time of the most recent completed check_all_alerts() run;
        # None until the first run finishes.
        self._last_check: Optional[datetime] = None

    @staticmethod
    def _seconds_since(moment: datetime, now: Optional[datetime] = None) -> float:
        """Return the number of seconds elapsed between *moment* and *now*.

        Args:
            moment: The earlier timestamp. A naive value is interpreted as
                UTC; an aware value keeps its own offset, so the instant it
                denotes is never shifted.
            now: Reference time (timezone-aware). Defaults to the current
                UTC time.

        Returns:
            Elapsed seconds as a float (negative if *moment* is in the future).
        """
        if moment.utcoffset() is None:
            # Naive timestamp: treat it as UTC. Only naive values are
            # relabelled -- calling replace() on an aware value would change
            # the instant it refers to.
            moment = moment.replace(tzinfo=timezone.utc)
        if now is None:
            now = datetime.now(timezone.utc)
        return (now - moment).total_seconds()

    async def check_all_alerts(self) -> Dict[str, Any]:
        """Check all Docker-enabled hosts for container issues.

        Every container of every Docker-enabled host is evaluated. A failure
        while processing one host is logged and does not stop the remaining
        hosts from being checked. All changes are committed once at the end.

        Returns:
            Summary of the run with the keys ``hosts_checked``,
            ``alerts_opened``, ``alerts_closed`` and ``notifications_sent``.
        """
        result: Dict[str, Any] = {
            "hosts_checked": 0,
            "alerts_opened": 0,
            "alerts_closed": 0,
            "notifications_sent": 0,
        }

        async with async_session_maker() as session:
            host_repo = HostRepository(session)
            container_repo = DockerContainerRepository(session)
            alert_repo = DockerAlertRepository(session)

            # Get all Docker-enabled hosts
            hosts = await host_repo.list_docker_enabled()
            result["hosts_checked"] = len(hosts)

            for host in hosts:
                try:
                    # Get all containers for this host
                    containers = await container_repo.list_by_host(host.id)
                    for container in containers:
                        await self._check_container(
                            session, host, container, alert_repo, result
                        )
                except Exception as e:
                    # Best effort per host: keep going with the other hosts,
                    # but keep the traceback in the log.
                    logger.exception(
                        "Error checking alerts for host %s: %s", host.id, e
                    )

            await session.commit()

        self._last_check = datetime.now(timezone.utc)
        return result

    async def _check_container(
        self,
        session,
        host,
        container,
        alert_repo: "DockerAlertRepository",
        result: Dict[str, Any]
    ) -> None:
        """Check a single container for issues.

        Args:
            session: Database session of the current check run (passed
                through to the alert handling).
            host: Host the container belongs to; ``id`` and ``name`` are read.
            container: Container record; ``labels``, ``state``, ``health``
                and ``name`` are read.
            alert_repo: Repository used to open and close alerts.
            result: Run summary, updated in place.
        """
        labels = container.labels or {}

        # Only monitor containers with homelab.monitor=true label. The value
        # is coerced to str so a non-string label (e.g. a JSON boolean) does
        # not raise on .lower().
        if str(labels.get("homelab.monitor", "")).lower() != "true":
            return

        # Desired state defaults to "running"; normalised the same way as the
        # monitor label so "Running" and "running" are treated alike.
        expected_state = str(labels.get("homelab.desired", "running")).lower()
        current_state = container.state
        health = container.health

        # Case 1: Container should be running but isn't
        if expected_state == "running" and current_state != "running":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "error",
                f"Container {container.name} is {current_state}, expected running",
                alert_repo,
                result,
            )
            return

        # Case 2: Container is unhealthy
        if health == "unhealthy":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "warning",
                f"Container {container.name} health check failing",
                alert_repo,
                result,
            )
            return

        # Case 3: Container is healthy and running - close any open alerts
        if current_state == "running" and health in ("healthy", None, "none"):
            closed = await alert_repo.close_for_container(host.id, container.name)
            if closed > 0:
                result["alerts_closed"] += closed
                logger.info(f"Closed {closed} alert(s) for {container.name} on {host.name}")
                # Broadcast alert closure
                await ws_manager.broadcast({
                    "type": "docker_alert_closed",
                    "data": {
                        "host_id": host.id,
                        "host_name": host.name,
                        "container_name": container.name
                    }
                })
        # Any other combination (e.g. a container that is not expected to be
        # running, or a health value not listed above) neither opens nor
        # closes alerts.

    async def _notify_and_record(
        self,
        host,
        container_name: str,
        severity: str,
        message: str,
        alert_id,
        alert_repo: "DockerAlertRepository",
        result: Dict[str, Any]
    ) -> None:
        """Send one notification for an alert and record the attempt.

        The attempt time is stored on the alert even when delivery failed, so
        attempts stay throttled by the cooldown; ``notifications_sent`` is
        only incremented for notifications that were actually handed over
        without an error.
        """
        delivered = await self._send_notification(
            host, container_name, severity, message
        )
        await alert_repo.update_last_notified(alert_id)
        if delivered:
            result["notifications_sent"] += 1

    async def _open_or_update_alert(
        self,
        session,
        host,
        container_name: str,
        severity: str,
        message: str,
        alert_repo: "DockerAlertRepository",
        result: Dict[str, Any]
    ) -> None:
        """Open a new alert or re-notify for an existing open one.

        If an open alert already exists for the container, a reminder
        notification is sent once the cooldown has elapsed (or immediately if
        the alert was never notified). Otherwise a new alert is created,
        notified and broadcast over WebSocket.

        NOTE(review): for an existing alert the stored severity/message are
        not refreshed here; only the notification uses the new values.
        """
        # Check for existing open alert
        existing = await alert_repo.get_open_alert(host.id, container_name)

        if existing:
            last_notified = existing.last_notified_at
            # Never notified yet, or the cooldown window has passed.
            due = (
                not last_notified
                or self._seconds_since(last_notified) > NOTIFICATION_COOLDOWN_SECONDS
            )
            if due:
                await self._notify_and_record(
                    host, container_name, severity, message,
                    existing.id, alert_repo, result,
                )
            return

        # Create new alert
        alert = await alert_repo.create(
            host_id=host.id,
            container_name=container_name,
            severity=severity,
            message=message
        )
        result["alerts_opened"] += 1
        logger.info(f"Opened alert for {container_name} on {host.name}: {message}")

        # Send notification
        await self._notify_and_record(
            host, container_name, severity, message,
            alert.id, alert_repo, result,
        )

        # Broadcast new alert via WebSocket
        await ws_manager.broadcast({
            "type": "docker_alert_opened",
            "data": {
                "id": alert.id,
                "host_id": host.id,
                "host_name": host.name,
                "container_name": container_name,
                "severity": severity,
                "message": message,
                "opened_at": datetime.now(timezone.utc).isoformat()
            }
        })

    async def _send_notification(
        self,
        host,
        container_name: str,
        severity: str,
        message: str
    ) -> bool:
        """Send ntfy notification for an alert.

        Failures are logged and never propagated, so a broken notifier cannot
        abort alert processing.

        Returns:
            True if the notification service call completed without raising,
            False otherwise.
        """
        priority = self._PRIORITY_BY_SEVERITY.get(severity, self._DEFAULT_PRIORITY)
        emoji = self._EMOJI_BY_SEVERITY.get(severity, self._DEFAULT_EMOJI)
        try:
            await notification_service.send(
                topic="homelab-docker",
                title=f"{emoji} Docker Alert - {host.name}",
                message=f"{container_name}: {message}",
                priority=priority,
                tags=["docker", severity]
            )
        except Exception as e:
            logger.exception("Failed to send notification: %s", e)
            return False
        logger.info(f"Sent notification for {container_name} on {host.name}")
        return True

    async def acknowledge_alert(
        self,
        alert_id: int,
        acknowledged_by: str,
        note: Optional[str] = None
    ) -> Optional[Dict[str, Any]]:
        """Acknowledge an alert.

        Args:
            alert_id: Identifier of the alert to acknowledge.
            acknowledged_by: Name recorded as the acknowledging user.
            note: Accepted for API compatibility.
                NOTE(review): currently not stored or forwarded anywhere.

        Returns:
            The acknowledged alert as a dict, or None if the repository did
            not return an alert for ``alert_id``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.acknowledge(alert_id, acknowledged_by)
            if not alert:
                return None

            # Host is looked up only to put its name into the broadcast.
            host = await host_repo.get(alert.host_id)
            await session.commit()

            acknowledged_at = alert.acknowledged_at
            # Broadcast acknowledgement
            await ws_manager.broadcast({
                "type": "docker_alert_acknowledged",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name,
                    "acknowledged_by": acknowledged_by,
                    "acknowledged_at": acknowledged_at.isoformat() if acknowledged_at else None
                }
            })
            return alert.to_dict()

    async def close_alert(self, alert_id: int) -> Optional[Dict[str, Any]]:
        """Manually close an alert.

        Returns:
            The closed alert as a dict, or None if the repository did not
            return an alert for ``alert_id``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.close(alert_id)
            if not alert:
                return None

            # Host is looked up only to put its name into the broadcast.
            host = await host_repo.get(alert.host_id)
            await session.commit()

            # Broadcast closure
            await ws_manager.broadcast({
                "type": "docker_alert_closed",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name
                }
            })
            return alert.to_dict()

    async def get_alerts(
        self,
        host_id: Optional[str] = None,
        state: Optional[str] = None,
        severity: Optional[str] = None,
        limit: int = 100,
        offset: int = 0
    ) -> Dict[str, Any]:
        """Get alerts with optional filters.

        Args:
            host_id: Restrict alerts (and counts) to this host.
            state: Restrict the listed alerts to this state.
            severity: Restrict the listed alerts to this severity.
            limit: Maximum number of alerts to return.
            offset: Number of alerts to skip.

        Returns:
            Dict with ``alerts`` (each alert dict enriched with
            ``host_name``), ``total``, ``open_count`` and
            ``acknowledged_count``. The counts are filtered by ``host_id``
            only, not by ``state`` or ``severity``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alerts = await alert_repo.list_alerts(
                host_id=host_id,
                state=state,
                severity=severity,
                limit=limit,
                offset=offset
            )
            counts = await alert_repo.count_alerts(host_id=host_id)

            # Enrich with host names. Each distinct host is looked up once
            # instead of once per alert.
            host_names: Dict[Any, str] = {}
            enriched = []
            for alert in alerts:
                entry = alert.to_dict()
                if alert.host_id not in host_names:
                    host = await host_repo.get(alert.host_id)
                    host_names[alert.host_id] = host.name if host else ""
                entry["host_name"] = host_names[alert.host_id]
                enriched.append(entry)

            return {
                "alerts": enriched,
                "total": counts["total"],
                "open_count": counts["open"],
                "acknowledged_count": counts["acknowledged"]
            }

    async def get_stats(self) -> Dict[str, Any]:
        """Get global Docker statistics.

        Returns:
            Dict with host, container, image, volume and open-alert counts
            (0 when a count query yields nothing) plus ``last_collection``,
            the most recent ``docker_last_collect_at`` over Docker-enabled
            hosts (None if there is none).
        """
        async with async_session_maker() as session:
            # Imported locally, as in the original implementation.
            from sqlalchemy import select, func
            from app.models import Host, DockerContainer, DockerImage, DockerVolume, DockerAlert

            async def scalar(statement):
                """Execute *statement* and return its single scalar value."""
                return (await session.execute(statement)).scalar()

            # Column comparisons build SQL expressions, so "== True" is
            # intentional here and must not become "is True".
            not_deleted = Host.deleted_at.is_(None)
            docker_on = Host.docker_enabled == True  # noqa: E712

            # Count hosts
            total_hosts = await scalar(
                select(func.count(Host.id)).where(not_deleted)
            )
            enabled_hosts = await scalar(
                select(func.count(Host.id)).where(not_deleted, docker_on)
            )
            online_hosts = await scalar(
                select(func.count(Host.id)).where(
                    not_deleted,
                    docker_on,
                    Host.docker_status == "online"
                )
            )

            # Count containers
            total_containers = await scalar(
                select(func.count(DockerContainer.id))
            )
            running_containers = await scalar(
                select(func.count(DockerContainer.id)).where(
                    DockerContainer.state == "running"
                )
            )

            # Count images and volumes
            total_images = await scalar(select(func.count(DockerImage.id)))
            total_volumes = await scalar(select(func.count(DockerVolume.id)))

            # Count open alerts
            open_alerts = await scalar(
                select(func.count(DockerAlert.id)).where(
                    DockerAlert.state == "open"
                )
            )

            # Last collection time
            # NOTE(review): unlike the host counts above, this query does not
            # exclude soft-deleted hosts -- confirm whether that is intended.
            last_collection = await scalar(
                select(func.max(Host.docker_last_collect_at)).where(docker_on)
            )

            return {
                "total_hosts": total_hosts or 0,
                "enabled_hosts": enabled_hosts or 0,
                "online_hosts": online_hosts or 0,
                "total_containers": total_containers or 0,
                "running_containers": running_containers or 0,
                "total_images": total_images or 0,
                "total_volumes": total_volumes or 0,
                "open_alerts": open_alerts or 0,
                "last_collection": last_collection
            }
# Singleton instance: created once when this module is imported and exposed
# at module level under this name.
docker_alerts_service = DockerAlertsService()