"""Docker alerts service.

Handles detection of container issues and alert notifications.
"""

import asyncio
import logging
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Optional

from app.models.database import async_session_maker
from app.crud.host import HostRepository
from app.crud.docker_container import DockerContainerRepository
from app.crud.docker_alert import DockerAlertRepository
from app.services.notification_service import notification_service
from app.services.websocket_service import ws_manager

logger = logging.getLogger("homelab.docker.alerts")

# Cooldown period between notifications for the same alert (5 minutes)
NOTIFICATION_COOLDOWN_SECONDS = 300


class DockerAlertsService:
    """Service for detecting Docker container issues and managing alerts."""

    def __init__(self):
        # Timestamp (UTC) of the last completed check_all_alerts() run.
        self._last_check: Optional[datetime] = None

    async def check_all_alerts(self) -> Dict[str, Any]:
        """Check all Docker-enabled hosts for container issues.

        Returns:
            Summary of alerts opened/closed, with the keys
            ``hosts_checked``, ``alerts_opened``, ``alerts_closed`` and
            ``notifications_sent``.
        """
        result = {
            "hosts_checked": 0,
            "alerts_opened": 0,
            "alerts_closed": 0,
            "notifications_sent": 0,
        }

        async with async_session_maker() as session:
            host_repo = HostRepository(session)
            container_repo = DockerContainerRepository(session)
            alert_repo = DockerAlertRepository(session)

            # Get all Docker-enabled hosts
            hosts = await host_repo.list_docker_enabled()
            result["hosts_checked"] = len(hosts)

            for host in hosts:
                # A failure on one host is logged and must not prevent the
                # remaining hosts from being checked.
                try:
                    containers = await container_repo.list_by_host(host.id)
                    for container in containers:
                        await self._check_container(
                            session, host, container, alert_repo, result
                        )
                except Exception as e:
                    logger.error(
                        "Error checking alerts for host %s: %s", host.id, e
                    )

            # Single commit for everything opened/closed/updated in this run.
            await session.commit()

        self._last_check = datetime.now(timezone.utc)
        return result

    async def _check_container(
        self,
        session,
        host,
        container,
        alert_repo: DockerAlertRepository,
        result: Dict[str, Any]
    ) -> None:
        """Check a single container for issues.

        Opens (or re-notifies) an alert when the container is not in the
        expected state or is unhealthy, and closes open alerts when the
        container is running and healthy. ``result`` is updated in place.
        """
        labels = container.labels or {}

        # Only monitor containers with homelab.monitor=true label.
        # str() tolerates a non-string label value instead of raising.
        if str(labels.get("homelab.monitor", "")).lower() != "true":
            return

        expected_state = labels.get("homelab.desired", "running")
        current_state = container.state
        health = container.health

        # Case 1: Container should be running but isn't
        if expected_state == "running" and current_state != "running":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "error",
                f"Container {container.name} is {current_state}, expected running",
                alert_repo,
                result,
            )
            return

        # Case 2: Container is unhealthy
        if health == "unhealthy":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "warning",
                f"Container {container.name} health check failing",
                alert_repo,
                result,
            )
            return

        # Case 3: Container is healthy and running - close any open alerts
        if current_state == "running" and health in ("healthy", None, "none"):
            closed = await alert_repo.close_for_container(host.id, container.name)
            if closed > 0:
                result["alerts_closed"] += closed
                logger.info(
                    "Closed %s alert(s) for %s on %s",
                    closed, container.name, host.name,
                )

                # Broadcast alert closure
                await ws_manager.broadcast({
                    "type": "docker_alert_closed",
                    "data": {
                        "host_id": host.id,
                        "host_name": host.name,
                        "container_name": container.name,
                    },
                })

    @staticmethod
    def _cooldown_elapsed(last_notified_at: Optional[datetime]) -> bool:
        """Return True when another notification may be sent for an alert.

        An alert that was never notified is always eligible. Naive
        timestamps are interpreted as UTC; aware timestamps are compared
        as-is rather than being relabelled as UTC.
        """
        if not last_notified_at:
            return True
        if last_notified_at.tzinfo is None:
            last_notified_at = last_notified_at.replace(tzinfo=timezone.utc)
        elapsed = (datetime.now(timezone.utc) - last_notified_at).total_seconds()
        return elapsed > NOTIFICATION_COOLDOWN_SECONDS

    async def _open_or_update_alert(
        self,
        session,
        host,
        container_name: str,
        severity: str,
        message: str,
        alert_repo: DockerAlertRepository,
        result: Dict[str, Any]
    ) -> None:
        """Open a new alert or update existing one.

        A notification is recorded (``last_notified`` updated and
        ``notifications_sent`` incremented) only when it was actually sent,
        so a failed delivery is retried on the next check instead of being
        suppressed for the whole cooldown period.
        """
        # Check for existing open alert
        existing = await alert_repo.get_open_alert(host.id, container_name)

        if existing:
            # Re-notify only once the cooldown has elapsed.
            if not self._cooldown_elapsed(existing.last_notified_at):
                return
            if await self._send_notification(host, container_name, severity, message):
                await alert_repo.update_last_notified(existing.id)
                result["notifications_sent"] += 1
            return

        # Create new alert
        alert = await alert_repo.create(
            host_id=host.id,
            container_name=container_name,
            severity=severity,
            message=message,
        )
        result["alerts_opened"] += 1
        logger.info(
            "Opened alert for %s on %s: %s", container_name, host.name, message
        )

        # Send notification
        if await self._send_notification(host, container_name, severity, message):
            await alert_repo.update_last_notified(alert.id)
            result["notifications_sent"] += 1

        # Broadcast new alert via WebSocket
        await ws_manager.broadcast({
            "type": "docker_alert_opened",
            "data": {
                "id": alert.id,
                "host_id": host.id,
                "host_name": host.name,
                "container_name": container_name,
                "severity": severity,
                "message": message,
                "opened_at": datetime.now(timezone.utc).isoformat(),
            },
        })

    async def _send_notification(
        self,
        host,
        container_name: str,
        severity: str,
        message: str
    ) -> bool:
        """Send ntfy notification for an alert.

        Delivery is best-effort: errors are logged, never raised.

        Returns:
            True if the notification service call completed without
            raising, False otherwise.
        """
        priority_map = {
            "warning": 3,
            "error": 4,
            "critical": 5,
        }
        priority = priority_map.get(severity, 3)

        if severity == "critical":
            emoji = "🚨"
        elif severity == "error":
            emoji = "⚠️"
        else:
            emoji = "🔔"

        try:
            await notification_service.send(
                topic="homelab-docker",
                title=f"{emoji} Docker Alert - {host.name}",
                message=f"{container_name}: {message}",
                priority=priority,
                tags=["docker", severity],
            )
        except Exception as e:
            logger.error("Failed to send notification: %s", e)
            return False

        logger.info("Sent notification for %s on %s", container_name, host.name)
        return True

    async def acknowledge_alert(
        self,
        alert_id: int,
        acknowledged_by: str,
        note: Optional[str] = None
    ) -> Optional[Dict[str, Any]]:
        """Acknowledge an alert.

        Args:
            alert_id: Identifier of the alert to acknowledge.
            acknowledged_by: Name recorded as the acknowledging party.
            note: Accepted for interface compatibility; currently unused.

        Returns:
            The alert as a dict, or None if the repository returned no alert.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.acknowledge(alert_id, acknowledged_by)
            if not alert:
                return None

            host = await host_repo.get(alert.host_id)
            await session.commit()

            acknowledged_at = (
                alert.acknowledged_at.isoformat() if alert.acknowledged_at else None
            )

            # Broadcast acknowledgement
            await ws_manager.broadcast({
                "type": "docker_alert_acknowledged",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name,
                    "acknowledged_by": acknowledged_by,
                    "acknowledged_at": acknowledged_at,
                },
            })

            return alert.to_dict()

    async def close_alert(self, alert_id: int) -> Optional[Dict[str, Any]]:
        """Manually close an alert.

        Returns:
            The alert as a dict, or None if the repository returned no alert.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.close(alert_id)
            if not alert:
                return None

            host = await host_repo.get(alert.host_id)
            await session.commit()

            # Broadcast closure
            await ws_manager.broadcast({
                "type": "docker_alert_closed",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name,
                },
            })

            return alert.to_dict()

    async def get_alerts(
        self,
        host_id: Optional[str] = None,
        state: Optional[str] = None,
        severity: Optional[str] = None,
        limit: int = 100,
        offset: int = 0
    ) -> Dict[str, Any]:
        """Get alerts with optional filters.

        Returns:
            Dict with ``alerts`` (each alert dict enriched with
            ``host_name``), ``total``, ``open_count`` and
            ``acknowledged_count``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alerts = await alert_repo.list_alerts(
                host_id=host_id,
                state=state,
                severity=severity,
                limit=limit,
                offset=offset,
            )
            counts = await alert_repo.count_alerts(host_id=host_id)

            # Enrich with host names. Each distinct host is looked up once
            # rather than once per alert.
            host_names: Dict[Any, str] = {}
            enriched = []
            for alert in alerts:
                if alert.host_id not in host_names:
                    host = await host_repo.get(alert.host_id)
                    host_names[alert.host_id] = host.name if host else ""
                alert_dict = alert.to_dict()
                alert_dict["host_name"] = host_names[alert.host_id]
                enriched.append(alert_dict)

            return {
                "alerts": enriched,
                "total": counts["total"],
                "open_count": counts["open"],
                "acknowledged_count": counts["acknowledged"],
            }

    async def get_stats(self) -> Dict[str, Any]:
        """Get global Docker statistics.

        Returns:
            Dict of host, container, image, volume and open-alert counts,
            plus ``last_collection`` (the latest ``docker_last_collect_at``
            among Docker-enabled hosts, or None).
        """
        async with async_session_maker() as session:
            # Imported here, as in the original, rather than at module level.
            from sqlalchemy import select, func
            from app.models import Host, DockerContainer, DockerImage, DockerVolume, DockerAlert

            async def scalar_of(statement):
                """Execute ``statement`` and return its scalar result."""
                return (await session.execute(statement)).scalar()

            # Count hosts
            total_hosts = await scalar_of(
                select(func.count(Host.id)).where(Host.deleted_at.is_(None))
            )
            enabled_hosts = await scalar_of(
                select(func.count(Host.id)).where(
                    Host.deleted_at.is_(None),
                    Host.docker_enabled == True,  # noqa: E712 (SQL expression)
                )
            )
            online_hosts = await scalar_of(
                select(func.count(Host.id)).where(
                    Host.deleted_at.is_(None),
                    Host.docker_enabled == True,  # noqa: E712 (SQL expression)
                    Host.docker_status == "online",
                )
            )

            # Count containers
            total_containers = await scalar_of(
                select(func.count(DockerContainer.id))
            )
            running_containers = await scalar_of(
                select(func.count(DockerContainer.id)).where(
                    DockerContainer.state == "running"
                )
            )

            # Count images and volumes
            total_images = await scalar_of(select(func.count(DockerImage.id)))
            total_volumes = await scalar_of(select(func.count(DockerVolume.id)))

            # Count open alerts
            open_alerts = await scalar_of(
                select(func.count(DockerAlert.id)).where(
                    DockerAlert.state == "open"
                )
            )

            # Last collection time
            last_collection = await scalar_of(
                select(func.max(Host.docker_last_collect_at)).where(
                    Host.docker_enabled == True  # noqa: E712 (SQL expression)
                )
            )

            return {
                "total_hosts": total_hosts or 0,
                "enabled_hosts": enabled_hosts or 0,
                "online_hosts": online_hosts or 0,
                "total_containers": total_containers or 0,
                "running_containers": running_containers or 0,
                "total_images": total_images or 0,
                "total_volumes": total_volumes or 0,
                "open_alerts": open_alerts or 0,
                "last_collection": last_collection,
            }


# Singleton instance
docker_alerts_service = DockerAlertsService()