Some checks failed
Tests / Backend Tests (Python) (3.10) (push) Has been cancelled
Tests / Backend Tests (Python) (3.11) (push) Has been cancelled
Tests / Backend Tests (Python) (3.12) (push) Has been cancelled
Tests / Frontend Tests (JS) (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / All Tests Passed (push) Has been cancelled
386 lines
14 KiB
Python
386 lines
14 KiB
Python
"""Docker alerts service.
|
|
|
|
Handles detection of container issues and alert notifications.
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime, timezone, timedelta
|
|
from typing import Dict, Any, List, Optional
|
|
|
|
from app.models.database import async_session_maker
|
|
from app.crud.host import HostRepository
|
|
from app.crud.docker_container import DockerContainerRepository
|
|
from app.crud.docker_alert import DockerAlertRepository
|
|
from app.services.notification_service import notification_service
|
|
from app.services.websocket_service import ws_manager
|
|
|
|
# Named logger used by every log call in this module.
logger = logging.getLogger("homelab.docker.alerts")

# Cooldown period between notifications for the same alert (5 minutes).
# A still-open alert is only re-notified once strictly more than this many
# seconds have passed since its last notification.
NOTIFICATION_COOLDOWN_SECONDS = 300
|
|
|
|
|
|
class DockerAlertsService:
    """Service for detecting Docker container issues and managing alerts.

    Container data is read through the repository classes. Alerts are
    opened and closed through ``DockerAlertRepository``, notifications go
    out through ``notification_service`` and every alert state change is
    broadcast through ``ws_manager``.
    """

    def __init__(self):
        # UTC time at which check_all_alerts() last finished; None until the
        # first run completes.
        self._last_check: Optional[datetime] = None

    @staticmethod
    def _as_utc(value: datetime) -> datetime:
        """Return *value* as a timezone-aware UTC datetime.

        A naive value is labelled as UTC (NOTE(review): this assumes the
        database stores naive timestamps in UTC - confirm). An aware value
        is converted, so a timestamp carrying another offset is not
        silently relabelled.
        """
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc)

    async def check_all_alerts(self) -> Dict[str, Any]:
        """Check all Docker-enabled hosts for container issues.

        Every container of every Docker-enabled host is evaluated inside a
        single session, which is committed once after all hosts have been
        processed. An exception raised while processing one host is logged
        and does not prevent the remaining hosts from being checked.

        Returns:
            Summary of the run with the keys ``hosts_checked``,
            ``alerts_opened``, ``alerts_closed`` and ``notifications_sent``.
        """
        result: Dict[str, Any] = {
            "hosts_checked": 0,
            "alerts_opened": 0,
            "alerts_closed": 0,
            "notifications_sent": 0,
        }

        async with async_session_maker() as session:
            host_repo = HostRepository(session)
            container_repo = DockerContainerRepository(session)
            alert_repo = DockerAlertRepository(session)

            # Get all Docker-enabled hosts
            hosts = await host_repo.list_docker_enabled()
            result["hosts_checked"] = len(hosts)

            for host in hosts:
                try:
                    # Get all containers for this host
                    containers = await container_repo.list_by_host(host.id)
                    for container in containers:
                        await self._check_container(
                            session, host, container, alert_repo, result
                        )
                except Exception as e:
                    # Best effort per host; logger.exception keeps the
                    # traceback that a plain error log would drop.
                    logger.exception(f"Error checking alerts for host {host.id}: {e}")

            await session.commit()

        self._last_check = datetime.now(timezone.utc)
        return result

    async def _check_container(
        self,
        session,
        host,
        container,
        alert_repo: "DockerAlertRepository",
        result: Dict[str, Any]
    ) -> None:
        """Check a single container for issues.

        Only containers labelled ``homelab.monitor=true`` are evaluated.
        The expected state comes from the ``homelab.desired`` label and
        defaults to ``"running"``.

        Args:
            session: Database session, passed through to alert handling.
            host: Host that owns the container (``id`` and ``name`` are read).
            container: Container record (``labels``, ``name``, ``state`` and
                ``health`` are read).
            alert_repo: Repository used to open and close alerts.
            result: Run counters, updated in place.
        """
        labels = container.labels or {}

        # Only monitor containers with homelab.monitor=true label.
        # str() keeps a non-string label value (None, bool) from raising.
        if str(labels.get("homelab.monitor", "")).lower() != "true":
            return

        expected_state = labels.get("homelab.desired", "running")
        current_state = container.state
        health = container.health

        # Case 1: Container should be running but isn't
        if expected_state == "running" and current_state != "running":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "error",
                f"Container {container.name} is {current_state}, expected running",
                alert_repo,
                result,
            )
            return

        # Case 2: Container is unhealthy
        if health == "unhealthy":
            await self._open_or_update_alert(
                session,
                host,
                container.name,
                "warning",
                f"Container {container.name} health check failing",
                alert_repo,
                result,
            )
            return

        # Case 3: Container is healthy and running - close any open alerts.
        # Any other combination (e.g. a health value not listed here) is
        # left untouched: nothing is opened and nothing is closed.
        if current_state == "running" and health in ("healthy", None, "none"):
            closed = await alert_repo.close_for_container(host.id, container.name)
            if closed > 0:
                result["alerts_closed"] += closed
                logger.info(f"Closed {closed} alert(s) for {container.name} on {host.name}")

                # Broadcast alert closure
                await ws_manager.broadcast({
                    "type": "docker_alert_closed",
                    "data": {
                        "host_id": host.id,
                        "host_name": host.name,
                        "container_name": container.name
                    }
                })

    async def _open_or_update_alert(
        self,
        session,
        host,
        container_name: str,
        severity: str,
        message: str,
        alert_repo: "DockerAlertRepository",
        result: Dict[str, Any]
    ) -> None:
        """Open a new alert, or re-notify for an alert that is already open.

        For an existing open alert nothing but ``last_notified_at`` is
        updated, and a notification is only attempted when the alert has
        never been notified or the cooldown has elapsed. For a new alert the
        alert is created, a notification is attempted and the alert is
        broadcast over WebSocket.

        ``last_notified_at`` and the ``notifications_sent`` counter are only
        updated when the notification was actually delivered, so a failed
        delivery is retried on the next check instead of waiting out the
        cooldown.

        Args:
            session: Database session (not used directly here).
            host: Host that owns the container (``id`` and ``name`` are read).
            container_name: Name of the affected container.
            severity: Alert severity, e.g. ``"warning"`` or ``"error"``.
            message: Human-readable description of the issue.
            alert_repo: Repository used to look up, create and update alerts.
            result: Run counters, updated in place.
        """
        # Check for existing open alert
        existing = await alert_repo.get_open_alert(host.id, container_name)

        if existing:
            # Check if we should send another notification
            last_notified = existing.last_notified_at
            if last_notified:
                elapsed = (
                    datetime.now(timezone.utc) - self._as_utc(last_notified)
                ).total_seconds()
                should_notify = elapsed > NOTIFICATION_COOLDOWN_SECONDS
            else:
                should_notify = True

            if should_notify and await self._send_notification(
                host, container_name, severity, message
            ):
                await alert_repo.update_last_notified(existing.id)
                result["notifications_sent"] += 1
            return

        # Create new alert
        alert = await alert_repo.create(
            host_id=host.id,
            container_name=container_name,
            severity=severity,
            message=message
        )
        result["alerts_opened"] += 1
        logger.info(f"Opened alert for {container_name} on {host.name}: {message}")

        # Send notification
        if await self._send_notification(host, container_name, severity, message):
            await alert_repo.update_last_notified(alert.id)
            result["notifications_sent"] += 1

        # Broadcast new alert via WebSocket
        await ws_manager.broadcast({
            "type": "docker_alert_opened",
            "data": {
                "id": alert.id,
                "host_id": host.id,
                "host_name": host.name,
                "container_name": container_name,
                "severity": severity,
                "message": message,
                "opened_at": datetime.now(timezone.utc).isoformat()
            }
        })

    async def _send_notification(
        self,
        host,
        container_name: str,
        severity: str,
        message: str
    ) -> bool:
        """Send ntfy notification for an alert.

        Delivery is best effort: any exception is logged and swallowed so
        that a notification outage never aborts an alert check.

        Args:
            host: Host that owns the container (``name`` is read).
            container_name: Name of the affected container.
            severity: ``"warning"``, ``"error"`` or ``"critical"``; any
                other value falls back to priority 3.
            message: Human-readable description of the issue.

        Returns:
            True when ``notification_service.send`` completed without
            raising, False otherwise.
        """
        try:
            priority_map = {
                "warning": 3,
                "error": 4,
                "critical": 5
            }
            priority = priority_map.get(severity, 3)

            emoji = "🚨" if severity == "critical" else "⚠️" if severity == "error" else "🔔"

            await notification_service.send(
                topic="homelab-docker",
                title=f"{emoji} Docker Alert - {host.name}",
                message=f"{container_name}: {message}",
                priority=priority,
                tags=["docker", severity]
            )
            logger.info(f"Sent notification for {container_name} on {host.name}")
            return True
        except Exception as e:
            logger.exception(f"Failed to send notification: {e}")
            return False

    async def acknowledge_alert(
        self,
        alert_id: int,
        acknowledged_by: str,
        note: Optional[str] = None
    ) -> Optional[Dict[str, Any]]:
        """Acknowledge an alert.

        Args:
            alert_id: Identifier of the alert to acknowledge.
            acknowledged_by: Name of whoever acknowledges the alert.
            note: Accepted but currently unused - it is neither passed to
                the repository nor broadcast.
                NOTE(review): confirm whether it should be persisted.

        Returns:
            The acknowledged alert as a dict, or None when the repository
            returned no alert for ``alert_id``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.acknowledge(alert_id, acknowledged_by)
            if not alert:
                return None

            host = await host_repo.get(alert.host_id)
            await session.commit()

            # Broadcast acknowledgement
            await ws_manager.broadcast({
                "type": "docker_alert_acknowledged",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name,
                    "acknowledged_by": acknowledged_by,
                    "acknowledged_at": alert.acknowledged_at.isoformat() if alert.acknowledged_at else None
                }
            })

            return alert.to_dict()

    async def close_alert(self, alert_id: int) -> Optional[Dict[str, Any]]:
        """Manually close an alert.

        Args:
            alert_id: Identifier of the alert to close.

        Returns:
            The closed alert as a dict, or None when the repository
            returned no alert for ``alert_id``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alert = await alert_repo.close(alert_id)
            if not alert:
                return None

            host = await host_repo.get(alert.host_id)
            await session.commit()

            # Broadcast closure
            await ws_manager.broadcast({
                "type": "docker_alert_closed",
                "data": {
                    "id": alert.id,
                    "host_id": alert.host_id,
                    "host_name": host.name if host else "",
                    "container_name": alert.container_name
                }
            })

            return alert.to_dict()

    async def get_alerts(
        self,
        host_id: Optional[str] = None,
        state: Optional[str] = None,
        severity: Optional[str] = None,
        limit: int = 100,
        offset: int = 0
    ) -> Dict[str, Any]:
        """Get alerts with optional filters.

        Args:
            host_id: Restrict the listing and the counts to one host.
            state: Restrict the listing to one alert state.
            severity: Restrict the listing to one severity.
            limit: Maximum number of alerts to return.
            offset: Number of alerts to skip.

        Returns:
            Dict with ``alerts`` (alert dicts enriched with ``host_name``)
            plus ``total``, ``open_count`` and ``acknowledged_count``. The
            counts are requested with the ``host_id`` filter only, not with
            ``state`` or ``severity``.
        """
        async with async_session_maker() as session:
            alert_repo = DockerAlertRepository(session)
            host_repo = HostRepository(session)

            alerts = await alert_repo.list_alerts(
                host_id=host_id,
                state=state,
                severity=severity,
                limit=limit,
                offset=offset
            )

            counts = await alert_repo.count_alerts(host_id=host_id)

            # Enrich with host names. Names are memoised per host so a page
            # of alerts from one host costs one lookup, not one per alert.
            host_names: Dict[Any, str] = {}
            enriched = []
            for alert in alerts:
                alert_dict = alert.to_dict()
                if alert.host_id not in host_names:
                    host = await host_repo.get(alert.host_id)
                    host_names[alert.host_id] = host.name if host else ""
                alert_dict["host_name"] = host_names[alert.host_id]
                enriched.append(alert_dict)

            return {
                "alerts": enriched,
                "total": counts["total"],
                "open_count": counts["open"],
                "acknowledged_count": counts["acknowledged"]
            }

    async def get_stats(self) -> Dict[str, Any]:
        """Get global Docker statistics.

        Returns:
            Dict with the counters ``total_hosts``, ``enabled_hosts``,
            ``online_hosts``, ``total_containers``, ``running_containers``,
            ``total_images``, ``total_volumes`` and ``open_alerts`` (0 when
            a query yields nothing), plus ``last_collection``, the most
            recent ``docker_last_collect_at`` of any Docker-enabled host
            (None when there is none).
        """
        # Imported inside the method, as in the original code.
        from sqlalchemy import select, func
        from app.models import Host, DockerContainer, DockerImage, DockerVolume, DockerAlert

        async with async_session_maker() as session:

            async def count(statement) -> int:
                """Run a COUNT statement and return its value, 0 if empty."""
                return (await session.execute(statement)).scalar() or 0

            not_deleted = Host.deleted_at.is_(None)
            # `== True` is intentional: on a SQLAlchemy column it builds a
            # SQL expression, which Python's `is True` would not.
            docker_enabled = Host.docker_enabled == True  # noqa: E712

            # Count hosts
            total_hosts = await count(
                select(func.count(Host.id)).where(not_deleted)
            )
            enabled_hosts = await count(
                select(func.count(Host.id)).where(not_deleted, docker_enabled)
            )
            online_hosts = await count(
                select(func.count(Host.id)).where(
                    not_deleted,
                    docker_enabled,
                    Host.docker_status == "online"
                )
            )

            # Count containers
            total_containers = await count(select(func.count(DockerContainer.id)))
            running_containers = await count(
                select(func.count(DockerContainer.id)).where(
                    DockerContainer.state == "running"
                )
            )

            # Count images and volumes
            total_images = await count(select(func.count(DockerImage.id)))
            total_volumes = await count(select(func.count(DockerVolume.id)))

            # Count open alerts
            open_alerts = await count(
                select(func.count(DockerAlert.id)).where(
                    DockerAlert.state == "open"
                )
            )

            # Last collection time (soft-deleted hosts are not excluded here)
            last_collect = await session.execute(
                select(func.max(Host.docker_last_collect_at)).where(docker_enabled)
            )

            return {
                "total_hosts": total_hosts,
                "enabled_hosts": enabled_hosts,
                "online_hosts": online_hosts,
                "total_containers": total_containers,
                "running_containers": running_containers,
                "total_images": total_images,
                "total_volumes": total_volumes,
                "open_alerts": open_alerts,
                "last_collection": last_collect.scalar()
            }
|
|
|
|
|
|
# Singleton instance, created once when this module is imported.
docker_alerts_service = DockerAlertsService()
|