homelab_automation/app/services/docker_service.py
Bruno Charest 68a9b0f390
Some checks failed
Tests / Backend Tests (Python) (3.10) (push) Has been cancelled
Tests / Backend Tests (Python) (3.11) (push) Has been cancelled
Tests / Backend Tests (Python) (3.12) (push) Has been cancelled
Tests / Frontend Tests (JS) (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / All Tests Passed (push) Has been cancelled
Remove Node.js cache files containing npm vulnerability data for vitest and vite packages
2025-12-15 20:36:06 -05:00

611 lines
26 KiB
Python

"""Docker collection service using SSH.
Collects Docker information from hosts via SSH commands.
"""
from __future__ import annotations
import asyncio
import json
import logging
import sys
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any, Tuple
try:
import asyncssh
_asyncssh_import_error: Optional[BaseException] = None
except ModuleNotFoundError as e: # pragma: no cover
asyncssh = None
_asyncssh_import_error = e
from app.core.config import settings
from app.models.database import async_session_maker
from app.crud.host import HostRepository
from app.crud.docker_container import DockerContainerRepository
from app.crud.docker_image import DockerImageRepository
from app.crud.docker_volume import DockerVolumeRepository
from app.crud.docker_alert import DockerAlertRepository
from app.services.ansible_service import ansible_service
logger = logging.getLogger("homelab.docker")
# Log prefix for standardized output
LOG_PREFIX = "[DOCKER]"
class SSHConnectionError(Exception):
    """Raised when an SSH connection to a host cannot be established or a
    remote command fails at the transport level."""
class DockerNotAvailableError(Exception):
    """Raised when the docker CLI/daemon cannot be used on the remote host."""
class DockerService:
    """Collects Docker state (version, containers, images, volumes) from
    remote hosts over SSH and persists it via the CRUD repositories."""

    def __init__(self):
        # SSH identity/users come from application settings.
        self.ssh_key_path = settings.ssh_key_path
        self.ssh_user = settings.ssh_user
        self.ssh_remote_user = settings.ssh_remote_user
        # Timeouts in seconds: one for connecting, one per remote command.
        self.connect_timeout = 5
        self.exec_timeout = 15
        # One asyncio.Lock per host id so a host is never collected twice
        # concurrently.
        self._locks: Dict[str, asyncio.Lock] = {}

    def _get_lock(self, host_id: str) -> asyncio.Lock:
        """Return the per-host lock, creating it on first use."""
        lock = self._locks.get(host_id)
        if lock is None:
            lock = asyncio.Lock()
            self._locks[host_id] = lock
        return lock
async def _ssh_connect(self, host_ip: str) -> asyncssh.SSHClientConnection:
"""Establish SSH connection to a host."""
if asyncssh is None:
err = None
try:
err = str(_asyncssh_import_error) if _asyncssh_import_error else None
except Exception:
err = None
raise SSHConnectionError(
"Missing dependency: asyncssh. Install it to enable Docker SSH collection. "
f"(python={sys.executable}, import_error={err})"
)
last_err: Optional[BaseException] = None
tried_users: List[str] = []
for username in [self.ssh_remote_user, self.ssh_user]:
if not username or username in tried_users:
continue
tried_users.append(username)
try:
conn = await asyncio.wait_for(
asyncssh.connect(
host_ip,
username=username,
client_keys=[self.ssh_key_path],
known_hosts=None, # Accept any host key (homelab environment)
),
timeout=self.connect_timeout
)
return conn
except asyncio.TimeoutError as e:
last_err = e
continue
except asyncssh.Error as e:
last_err = e
continue
except Exception as e:
last_err = e
continue
if isinstance(last_err, asyncio.TimeoutError):
raise SSHConnectionError(f"SSH connection timeout to {host_ip} (users tried: {', '.join(tried_users)})")
raise SSHConnectionError(
f"SSH error connecting to {host_ip} (users tried: {', '.join(tried_users)}): {last_err}"
)
async def _ssh_exec(
self,
conn: asyncssh.SSHClientConnection,
command: str,
timeout: Optional[int] = None
) -> Tuple[str, str, int]:
"""Execute a command via SSH and return stdout, stderr, exit_code."""
timeout = timeout or self.exec_timeout
try:
result = await asyncio.wait_for(
conn.run(command, check=False),
timeout=timeout
)
return result.stdout or "", result.stderr or "", result.exit_status
except asyncio.TimeoutError:
raise SSHConnectionError(f"Command timeout: {command[:50]}...")
except Exception as e:
raise SSHConnectionError(f"Command execution error: {e}")
async def _parse_docker_json_lines(self, output: str) -> List[Dict[str, Any]]:
"""Parse Docker JSON output (one JSON object per line)."""
results = []
for line in output.strip().split('\n'):
line = line.strip()
if line:
try:
results.append(json.loads(line))
except json.JSONDecodeError:
continue
return results
    async def collect_docker_host(self, host_id: str) -> Dict[str, Any]:
        """Collect Docker information from a single host.

        Connects over SSH, probes on the first command whether ``sudo -n docker``
        or plain ``docker`` works, gathers version/containers/images/volumes,
        and upserts the results into the database (removing stale rows). A
        per-host lock prevents concurrent collections for the same host.

        Returns:
            Dict with collection results including success status, counts,
            docker version, error text (if any) and duration in ms.
        """
        start_time = time.time()
        # Result skeleton; fields are filled in as the collection progresses.
        result = {
            "success": False,
            "host_id": host_id,
            "host_name": "",
            "message": "",
            "docker_version": None,
            "containers_count": 0,
            "images_count": 0,
            "volumes_count": 0,
            "error": None,
            "duration_ms": 0
        }
        async with self._get_lock(host_id):
            async with async_session_maker() as session:
                try:
                    # Get host info
                    host_repo = HostRepository(session)
                    host = await host_repo.get(host_id)
                    if not host:
                        # NOTE: early return skips the duration_ms stamp below
                        # (it stays 0).
                        result["error"] = "Host not found"
                        result["message"] = "Host not found"
                        return result
                    result["host_name"] = host.name
                    host_ip = host.ip_address
                    print(f"{LOG_PREFIX} Starting collection for host {host.name} ({host_id[:8]}...) at {host_ip}")
                    # Connect via SSH
                    conn = await self._ssh_connect(host_ip)
                    try:
                        # Tri-state: None = undecided, True = prefix commands
                        # with 'sudo -n', False = run docker directly.
                        use_sudo: Optional[bool] = None
                        async def run_docker(cmd: str) -> Tuple[str, str, int]:
                            # Runs a docker CLI command; the first call decides
                            # (and caches) whether sudo is needed/usable.
                            nonlocal use_sudo
                            def _is_sudo_unavailable(stderr: str) -> bool:
                                # Heuristic match against known sudo failure
                                # messages on stderr.
                                s = (stderr or "").lower()
                                return any(
                                    token in s
                                    for token in (
                                        "sudo: a password is required",
                                        "sudo: no tty present",
                                        "not in the sudoers",
                                        "sudo: permission denied",
                                        "sudo: command not found",
                                    )
                                )
                            # If we've already determined sudo behavior, reuse it.
                            if use_sudo is True:
                                return await self._ssh_exec(conn, f"sudo -n {cmd}")
                            if use_sudo is False:
                                return await self._ssh_exec(conn, cmd)
                            # First call: prefer sudo -n (passwordless sudo) to avoid docker.sock permission issues.
                            sudo_out, sudo_err, sudo_code = await self._ssh_exec(conn, f"sudo -n {cmd}")
                            if sudo_code == 0:
                                use_sudo = True
                                return sudo_out, sudo_err, sudo_code
                            # If sudo is not available, fall back to non-sudo.
                            if _is_sudo_unavailable(sudo_err):
                                use_sudo = False
                                return await self._ssh_exec(conn, cmd)
                            # Sudo ran but command failed; try non-sudo as a fallback (e.g., rootless docker).
                            plain_out, plain_err, plain_code = await self._ssh_exec(conn, cmd)
                            if plain_code == 0:
                                use_sudo = False
                                return plain_out, plain_err, plain_code
                            # Keep sudo as preferred for subsequent commands (best chance to avoid docker.sock issues)
                            use_sudo = True
                            # Merge both stderr streams so the caller sees why
                            # each attempt failed.
                            merged_err = (plain_err or "").strip()
                            if sudo_err:
                                merged_err = (merged_err + "\n" if merged_err else "") + f"sudo_err: {(sudo_err or '').strip()}"
                            return plain_out, merged_err, plain_code
                        # Get Docker version
                        base_version_cmd = "docker version --format '{{.Server.Version}}'"
                        version_out, version_err, version_code = await run_docker(base_version_cmd)
                        if version_code != 0:
                            err = (version_err or "").strip()
                            # Give an actionable hint for the common
                            # docker.sock permission problem.
                            if "permission denied" in err.lower() and "docker.sock" in err.lower():
                                raise DockerNotAvailableError(
                                    "Docker not available: permission denied while trying to connect to the Docker daemon socket. "
                                    "Fix one of: set SSH_REMOTE_USER=root, add the SSH user to the 'docker' group, "
                                    "or allow passwordless sudo for docker (sudo -n). "
                                    f"Raw error: {err}"
                                )
                            raise DockerNotAvailableError(f"Docker not available: {err}")
                        docker_version = version_out.strip()
                        result["docker_version"] = docker_version
                        print(f"{LOG_PREFIX} {host.name}: Docker version {docker_version}, using sudo={use_sudo}")
                        # Collect containers (one JSON object per line)
                        containers_out, _, _ = await run_docker(
                            "docker ps -a --format '{{json .}}' --no-trunc"
                        )
                        containers_data = await self._parse_docker_json_lines(containers_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(containers_data)} container(s)")
                        # Collect images
                        images_out, _, _ = await run_docker(
                            "docker images --format '{{json .}}'"
                        )
                        images_data = await self._parse_docker_json_lines(images_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(images_data)} image(s)")
                        # Collect volumes
                        volumes_out, _, _ = await run_docker(
                            "docker volume ls --format '{{json .}}'"
                        )
                        volumes_data = await self._parse_docker_json_lines(volumes_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(volumes_data)} volume(s)")
                    finally:
                        conn.close()
                    # Store in database
                    container_repo = DockerContainerRepository(session)
                    image_repo = DockerImageRepository(session)
                    volume_repo = DockerVolumeRepository(session)
                    # Process containers
                    container_ids = []
                    for c in containers_data:
                        container_id = c.get("ID", "")
                        container_ids.append(container_id)
                        # Parse created time
                        created_at = None
                        created_str = c.get("CreatedAt", "")
                        if created_str:
                            try:
                                # Docker format: "2024-01-15 10:30:00 -0500 EST"
                                # -> keep only the date+time parts as ISO 8601.
                                created_at = datetime.fromisoformat(
                                    created_str.split(" ")[0] + "T" + created_str.split(" ")[1]
                                )
                            except (ValueError, IndexError):
                                pass
                        # Extract compose project from labels
                        compose_project = None
                        labels = c.get("Labels", "")
                        if labels:
                            for label in labels.split(","):
                                if "com.docker.compose.project=" in label:
                                    compose_project = label.split("=", 1)[1]
                                    break
                        # Parse labels into dict
                        labels_dict = {}
                        if labels:
                            for label in labels.split(","):
                                if "=" in label:
                                    k, v = label.split("=", 1)
                                    labels_dict[k] = v
                        # Parse ports (kept as the raw CLI string)
                        ports_str = c.get("Ports", "")
                        ports = {"raw": ports_str} if ports_str else None
                        # Determine health from status
                        status = c.get("Status", "")
                        health = None
                        if "(healthy)" in status.lower():
                            health = "healthy"
                        elif "(unhealthy)" in status.lower():
                            health = "unhealthy"
                        elif "(health: starting)" in status.lower():
                            health = "starting"
                        await container_repo.upsert(
                            host_id=host_id,
                            container_id=container_id,
                            name=c.get("Names", "").lstrip("/"),
                            image=c.get("Image", ""),
                            state=c.get("State", "unknown").lower(),
                            status=status,
                            health=health,
                            created_at=created_at,
                            ports=ports,
                            labels=labels_dict if labels_dict else None,
                            compose_project=compose_project
                        )
                    # Remove stale containers
                    await container_repo.delete_stale(host_id, container_ids)
                    result["containers_count"] = len(containers_data)
                    # Process images
                    image_ids = []
                    for img in images_data:
                        image_id = img.get("ID", "")
                        image_ids.append(image_id)
                        # Parse size (e.g., "150MB" -> bytes)
                        size_str = img.get("Size", "0")
                        size = self._parse_size(size_str)
                        # Parse created time
                        created = None
                        created_str = img.get("CreatedAt", "")
                        if created_str:
                            try:
                                created = datetime.fromisoformat(
                                    created_str.split(" ")[0] + "T" + created_str.split(" ")[1]
                                )
                            except (ValueError, IndexError):
                                pass
                        # Get repo tags (untagged images keep repo_tags=None)
                        repo = img.get("Repository", "<none>")
                        tag = img.get("Tag", "<none>")
                        repo_tags = None
                        if repo != "<none>":
                            repo_tags = [f"{repo}:{tag}"]
                        await image_repo.upsert(
                            host_id=host_id,
                            image_id=image_id,
                            repo_tags=repo_tags,
                            size=size,
                            created=created
                        )
                    await image_repo.delete_stale(host_id, image_ids)
                    result["images_count"] = len(images_data)
                    # Process volumes
                    volume_names = []
                    for vol in volumes_data:
                        name = vol.get("Name", "")
                        volume_names.append(name)
                        await volume_repo.upsert(
                            host_id=host_id,
                            name=name,
                            driver=vol.get("Driver", "local"),
                            mountpoint=vol.get("Mountpoint", ""),
                            scope=vol.get("Scope", "local")
                        )
                    await volume_repo.delete_stale(host_id, volume_names)
                    result["volumes_count"] = len(volumes_data)
                    # Update host Docker status
                    await host_repo.update(
                        host,
                        docker_version=docker_version,
                        docker_status="online",
                        docker_last_collect_at=datetime.now(timezone.utc)
                    )
                    await session.commit()
                    result["success"] = True
                    result["message"] = "Collection completed"
                    duration_ms = int((time.time() - start_time) * 1000)
                    print(f"{LOG_PREFIX} {host.name}: Collection completed in {duration_ms}ms - {len(containers_data)} containers, {len(images_data)} images, {len(volumes_data)} volumes")
                except SSHConnectionError as e:
                    # Raised only after `host` was loaded (by _ssh_connect /
                    # _ssh_exec), so `host` is safe to reference here.
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    # Update host status to offline
                    await host_repo.update(host, docker_status="offline")
                    await session.commit()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: SSH connection error - {e}")
                    logger.warning(f"{LOG_PREFIX} SSH error collecting Docker from {host_id}: {e}")
                except DockerNotAvailableError as e:
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    await host_repo.update(host, docker_status="error")
                    await session.commit()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Docker not available - {e}")
                    logger.warning(f"{LOG_PREFIX} Docker not available on {host_id}: {e}")
                except Exception as e:
                    # Unexpected failure: record it, roll back any partial DB
                    # writes, and log the full traceback.
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    import traceback
                    tb = traceback.format_exc()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Collection error - {e}")
                    logger.error(f"{LOG_PREFIX} Error collecting Docker from {host_id}: {e}\n{tb}")
                    await session.rollback()
                # Always stamp the duration and a final message before returning
                # (except on the "Host not found" early return above).
                result["duration_ms"] = int((time.time() - start_time) * 1000)
                if not result.get("message"):
                    result["message"] = "Collection completed" if result.get("success") else "Collection failed"
                return result
async def collect_all_hosts(self) -> Dict[str, Any]:
"""Collect Docker information from all enabled hosts.
Returns:
Summary of collection results
"""
results = {
"success": True,
"total_hosts": 0,
"successful": 0,
"failed": 0,
"results": []
}
async with async_session_maker() as session:
host_repo = HostRepository(session)
hosts = await host_repo.list_docker_enabled()
results["total_hosts"] = len(hosts)
# Collect from all hosts concurrently (with limit)
semaphore = asyncio.Semaphore(5) # Max 5 concurrent collections
async def collect_with_semaphore(host):
async with semaphore:
return await self.collect_docker_host(host.id)
if hosts:
host_results = await asyncio.gather(
*[collect_with_semaphore(host) for host in hosts],
return_exceptions=True
)
for r in host_results:
if isinstance(r, Exception):
results["failed"] += 1
results["results"].append({
"success": False,
"host_id": "",
"host_name": "",
"message": "Collection failed",
"docker_version": None,
"containers_count": 0,
"images_count": 0,
"volumes_count": 0,
"duration_ms": 0,
"error": str(r)
})
elif r.get("success"):
results["successful"] += 1
results["results"].append(r)
else:
results["failed"] += 1
results["results"].append(r)
results["success"] = results["failed"] == 0
return results
def _parse_size(self, size_str: str) -> int:
"""Parse Docker size string to bytes."""
if not size_str:
return 0
size_str = size_str.upper().strip()
multipliers = {
"TB": 1024 ** 4,
"GB": 1024 ** 3,
"MB": 1024 ** 2,
"KB": 1024,
"B": 1,
}
# Important: match longer suffixes first, otherwise 'B' would match 'MB'
for suffix in ("TB", "GB", "MB", "KB", "B"):
mult = multipliers[suffix]
if size_str.endswith(suffix):
try:
num = float(size_str[:-len(suffix)])
return int(num * mult)
except ValueError:
return 0
try:
return int(size_str)
except ValueError:
return 0
    async def get_docker_hosts(self) -> List[Dict[str, Any]]:
        """Return a summary row (counts, status, alerts) for each Docker host.

        Hosts are limited to active (non-deleted) entries and — when the
        Ansible inventory is readable — to members of the ``role_docker``
        group. Results are sorted by host name.
        """
        async with async_session_maker() as session:
            host_repo = HostRepository(session)
            container_repo = DockerContainerRepository(session)
            image_repo = DockerImageRepository(session)
            volume_repo = DockerVolumeRepository(session)
            alert_repo = DockerAlertRepository(session)
            # Best-effort: read inventory and only show hosts in role_docker group.
            # Visibility is controlled by inventory membership; docker_enabled remains a user toggle.
            docker_host_names: Optional[set[str]] = None
            try:
                inv_hosts = ansible_service.get_hosts_from_inventory()
                docker_host_names = {
                    h.name
                    for h in inv_hosts
                    if "role_docker" in (getattr(h, "groups", None) or [])
                }
            except Exception:
                # Inventory read is best-effort; do not break Docker UI
                docker_host_names = None
            # Return all active hosts so the UI can show both enabled and disabled
            # Docker monitoring states (pause/play).
            hosts = await host_repo.list(limit=1000)
            hosts = [h for h in hosts if not h.deleted_at]
            if docker_host_names is not None:
                hosts = [h for h in hosts if h.name in docker_host_names]
            hosts = sorted(hosts, key=lambda h: (h.name or ""))
            result = []
            for host in hosts:
                # Defensive re-check; deleted hosts were already filtered above.
                if host.deleted_at:
                    continue
                containers = await container_repo.count_by_host(host.id)
                images = await image_repo.count_by_host(host.id)
                volumes = await volume_repo.count_by_host(host.id)
                open_alerts = await alert_repo.count_open_by_host(host.id)
                result.append({
                    "host_id": host.id,
                    "host_name": host.name,
                    "host_ip": host.ip_address,
                    "docker_enabled": host.docker_enabled,
                    "docker_version": host.docker_version,
                    "docker_status": host.docker_status,
                    "docker_last_collect_at": host.docker_last_collect_at,
                    "containers_total": containers.get("total", 0),
                    "containers_running": containers.get("running", 0),
                    "images_total": images.get("total", 0),
                    "volumes_total": volumes,
                    "open_alerts": open_alerts
                })
            return result
async def enable_docker_monitoring(self, host_id: str, enabled: bool = True) -> bool:
"""Enable or disable Docker monitoring on a host."""
async with async_session_maker() as session:
host_repo = HostRepository(session)
host = await host_repo.get(host_id)
if not host:
return False
await host_repo.update(host, docker_enabled=enabled)
await session.commit()
# If enabling, trigger an immediate collection
if enabled:
asyncio.create_task(self.collect_docker_host(host_id))
return True
# Singleton instance shared across the application (imported by API routes
# and background schedulers).
docker_service = DockerService()