Some checks failed
Tests / Backend Tests (Python) (3.10) (push) Has been cancelled
Tests / Backend Tests (Python) (3.11) (push) Has been cancelled
Tests / Backend Tests (Python) (3.12) (push) Has been cancelled
Tests / Frontend Tests (JS) (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / All Tests Passed (push) Has been cancelled
611 lines
26 KiB
Python
611 lines
26 KiB
Python
"""Docker collection service using SSH.

Collects Docker information from hosts via SSH commands.
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
|
|
try:
|
|
import asyncssh
|
|
_asyncssh_import_error: Optional[BaseException] = None
|
|
except ModuleNotFoundError as e: # pragma: no cover
|
|
asyncssh = None
|
|
_asyncssh_import_error = e
|
|
|
|
from app.core.config import settings
|
|
from app.models.database import async_session_maker
|
|
from app.crud.host import HostRepository
|
|
from app.crud.docker_container import DockerContainerRepository
|
|
from app.crud.docker_image import DockerImageRepository
|
|
from app.crud.docker_volume import DockerVolumeRepository
|
|
from app.crud.docker_alert import DockerAlertRepository
|
|
from app.services.ansible_service import ansible_service
|
|
|
|
logger = logging.getLogger("homelab.docker")
|
|
|
|
# Log prefix for standardized output
|
|
LOG_PREFIX = "[DOCKER]"
|
|
|
|
|
|
class SSHConnectionError(Exception):
    """Raised when an SSH connection cannot be established or a remote command fails."""
|
|
|
|
|
|
class DockerNotAvailableError(Exception):
    """Raised when Docker cannot be reached or used on the target host."""
|
|
|
|
|
|
class DockerService:
    """Service for collecting Docker information via SSH.

    Connects to hosts with key-based SSH, runs ``docker`` CLI commands
    (preferring passwordless ``sudo -n`` when available) and persists the
    results through the repository layer.
    """

    def __init__(self):
        # SSH credentials/configuration come from application settings.
        self.ssh_key_path = settings.ssh_key_path
        self.ssh_user = settings.ssh_user
        self.ssh_remote_user = settings.ssh_remote_user
        # Timeouts in seconds: SSH handshake vs. per-command execution.
        self.connect_timeout = 5
        self.exec_timeout = 15
        # Per-host locks so two collections for the same host never interleave.
        self._locks: Dict[str, asyncio.Lock] = {}
|
|
|
|
def _get_lock(self, host_id: str) -> asyncio.Lock:
|
|
"""Get or create a lock for a host to prevent concurrent operations."""
|
|
if host_id not in self._locks:
|
|
self._locks[host_id] = asyncio.Lock()
|
|
return self._locks[host_id]
|
|
|
|
async def _ssh_connect(self, host_ip: str) -> asyncssh.SSHClientConnection:
|
|
"""Establish SSH connection to a host."""
|
|
if asyncssh is None:
|
|
err = None
|
|
try:
|
|
err = str(_asyncssh_import_error) if _asyncssh_import_error else None
|
|
except Exception:
|
|
err = None
|
|
raise SSHConnectionError(
|
|
"Missing dependency: asyncssh. Install it to enable Docker SSH collection. "
|
|
f"(python={sys.executable}, import_error={err})"
|
|
)
|
|
last_err: Optional[BaseException] = None
|
|
tried_users: List[str] = []
|
|
for username in [self.ssh_remote_user, self.ssh_user]:
|
|
if not username or username in tried_users:
|
|
continue
|
|
tried_users.append(username)
|
|
try:
|
|
conn = await asyncio.wait_for(
|
|
asyncssh.connect(
|
|
host_ip,
|
|
username=username,
|
|
client_keys=[self.ssh_key_path],
|
|
known_hosts=None, # Accept any host key (homelab environment)
|
|
),
|
|
timeout=self.connect_timeout
|
|
)
|
|
return conn
|
|
except asyncio.TimeoutError as e:
|
|
last_err = e
|
|
continue
|
|
except asyncssh.Error as e:
|
|
last_err = e
|
|
continue
|
|
except Exception as e:
|
|
last_err = e
|
|
continue
|
|
|
|
if isinstance(last_err, asyncio.TimeoutError):
|
|
raise SSHConnectionError(f"SSH connection timeout to {host_ip} (users tried: {', '.join(tried_users)})")
|
|
raise SSHConnectionError(
|
|
f"SSH error connecting to {host_ip} (users tried: {', '.join(tried_users)}): {last_err}"
|
|
)
|
|
|
|
async def _ssh_exec(
|
|
self,
|
|
conn: asyncssh.SSHClientConnection,
|
|
command: str,
|
|
timeout: Optional[int] = None
|
|
) -> Tuple[str, str, int]:
|
|
"""Execute a command via SSH and return stdout, stderr, exit_code."""
|
|
timeout = timeout or self.exec_timeout
|
|
try:
|
|
result = await asyncio.wait_for(
|
|
conn.run(command, check=False),
|
|
timeout=timeout
|
|
)
|
|
return result.stdout or "", result.stderr or "", result.exit_status
|
|
except asyncio.TimeoutError:
|
|
raise SSHConnectionError(f"Command timeout: {command[:50]}...")
|
|
except Exception as e:
|
|
raise SSHConnectionError(f"Command execution error: {e}")
|
|
|
|
async def _parse_docker_json_lines(self, output: str) -> List[Dict[str, Any]]:
|
|
"""Parse Docker JSON output (one JSON object per line)."""
|
|
results = []
|
|
for line in output.strip().split('\n'):
|
|
line = line.strip()
|
|
if line:
|
|
try:
|
|
results.append(json.loads(line))
|
|
except json.JSONDecodeError:
|
|
continue
|
|
return results
|
|
|
|
    async def collect_docker_host(self, host_id: str) -> Dict[str, Any]:
        """Collect Docker information from a single host.

        Serialized per host via ``_get_lock`` so two collections for the same
        host never interleave. Connects over SSH, queries containers, images
        and volumes via the ``docker`` CLI, upserts the rows and prunes stale
        ones, then stamps the host's Docker status.

        Args:
            host_id: Database id of the host to collect from.

        Returns:
            Dict with collection results including success status, counts, etc.
        """
        start_time = time.time()
        # Result skeleton returned on every path (success, failure, host missing).
        result = {
            "success": False,
            "host_id": host_id,
            "host_name": "",
            "message": "",
            "docker_version": None,
            "containers_count": 0,
            "images_count": 0,
            "volumes_count": 0,
            "error": None,
            "duration_ms": 0
        }

        async with self._get_lock(host_id):
            async with async_session_maker() as session:
                try:
                    # Get host info
                    host_repo = HostRepository(session)
                    host = await host_repo.get(host_id)

                    if not host:
                        result["error"] = "Host not found"
                        result["message"] = "Host not found"
                        return result

                    result["host_name"] = host.name
                    host_ip = host.ip_address

                    print(f"{LOG_PREFIX} Starting collection for host {host.name} ({host_id[:8]}...) at {host_ip}")

                    # Connect via SSH
                    conn = await self._ssh_connect(host_ip)

                    try:
                        # Tri-state: None = undecided, True = prefix commands with
                        # "sudo -n", False = run them plainly. Decided on first call.
                        use_sudo: Optional[bool] = None

                        async def run_docker(cmd: str) -> Tuple[str, str, int]:
                            """Run a docker CLI command, probing sudo on first use."""
                            nonlocal use_sudo

                            def _is_sudo_unavailable(stderr: str) -> bool:
                                # Heuristic match on well-known sudo failure messages.
                                s = (stderr or "").lower()
                                return any(
                                    token in s
                                    for token in (
                                        "sudo: a password is required",
                                        "sudo: no tty present",
                                        "not in the sudoers",
                                        "sudo: permission denied",
                                        "sudo: command not found",
                                    )
                                )

                            # If we've already determined sudo behavior, reuse it.
                            if use_sudo is True:
                                return await self._ssh_exec(conn, f"sudo -n {cmd}")
                            if use_sudo is False:
                                return await self._ssh_exec(conn, cmd)

                            # First call: prefer sudo -n (passwordless sudo) to avoid docker.sock permission issues.
                            sudo_out, sudo_err, sudo_code = await self._ssh_exec(conn, f"sudo -n {cmd}")
                            if sudo_code == 0:
                                use_sudo = True
                                return sudo_out, sudo_err, sudo_code

                            # If sudo is not available, fall back to non-sudo.
                            if _is_sudo_unavailable(sudo_err):
                                use_sudo = False
                                return await self._ssh_exec(conn, cmd)

                            # Sudo ran but command failed; try non-sudo as a fallback (e.g., rootless docker).
                            plain_out, plain_err, plain_code = await self._ssh_exec(conn, cmd)
                            if plain_code == 0:
                                use_sudo = False
                                return plain_out, plain_err, plain_code

                            # Keep sudo as preferred for subsequent commands (best chance to avoid docker.sock issues)
                            use_sudo = True
                            # Both attempts failed: surface both stderr streams.
                            merged_err = (plain_err or "").strip()
                            if sudo_err:
                                merged_err = (merged_err + "\n" if merged_err else "") + f"sudo_err: {(sudo_err or '').strip()}"
                            return plain_out, merged_err, plain_code

                        # Get Docker version (also serves as the availability probe).
                        base_version_cmd = "docker version --format '{{.Server.Version}}'"
                        version_out, version_err, version_code = await run_docker(base_version_cmd)

                        if version_code != 0:
                            err = (version_err or "").strip()
                            # Give an actionable message for the common docker.sock
                            # permission problem; otherwise pass the raw error through.
                            if "permission denied" in err.lower() and "docker.sock" in err.lower():
                                raise DockerNotAvailableError(
                                    "Docker not available: permission denied while trying to connect to the Docker daemon socket. "
                                    "Fix one of: set SSH_REMOTE_USER=root, add the SSH user to the 'docker' group, "
                                    "or allow passwordless sudo for docker (sudo -n). "
                                    f"Raw error: {err}"
                                )
                            raise DockerNotAvailableError(f"Docker not available: {err}")

                        docker_version = version_out.strip()
                        result["docker_version"] = docker_version
                        print(f"{LOG_PREFIX} {host.name}: Docker version {docker_version}, using sudo={use_sudo}")

                        # Collect containers
                        containers_out, _, _ = await run_docker(
                            "docker ps -a --format '{{json .}}' --no-trunc"
                        )
                        containers_data = await self._parse_docker_json_lines(containers_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(containers_data)} container(s)")

                        # Collect images
                        images_out, _, _ = await run_docker(
                            "docker images --format '{{json .}}'"
                        )
                        images_data = await self._parse_docker_json_lines(images_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(images_data)} image(s)")

                        # Collect volumes
                        volumes_out, _, _ = await run_docker(
                            "docker volume ls --format '{{json .}}'"
                        )
                        volumes_data = await self._parse_docker_json_lines(volumes_out)
                        print(f"{LOG_PREFIX} {host.name}: Found {len(volumes_data)} volume(s)")

                    finally:
                        # NOTE(review): close() only initiates shutdown; wait_closed()
                        # is not awaited here — confirm that is intentional.
                        conn.close()

                    # Store in database
                    container_repo = DockerContainerRepository(session)
                    image_repo = DockerImageRepository(session)
                    volume_repo = DockerVolumeRepository(session)

                    # Process containers
                    container_ids = []
                    for c in containers_data:
                        container_id = c.get("ID", "")
                        container_ids.append(container_id)

                        # Parse created time
                        created_at = None
                        created_str = c.get("CreatedAt", "")
                        if created_str:
                            try:
                                # Docker format: "2024-01-15 10:30:00 -0500 EST"
                                # Only the date and time fields are used; the offset
                                # and zone name are dropped, yielding a naive datetime.
                                created_at = datetime.fromisoformat(
                                    created_str.split(" ")[0] + "T" + created_str.split(" ")[1]
                                )
                            except (ValueError, IndexError):
                                pass

                        # Extract compose project from labels
                        compose_project = None
                        labels = c.get("Labels", "")
                        if labels:
                            for label in labels.split(","):
                                if "com.docker.compose.project=" in label:
                                    compose_project = label.split("=", 1)[1]
                                    break

                        # Parse labels into dict
                        labels_dict = {}
                        if labels:
                            for label in labels.split(","):
                                if "=" in label:
                                    k, v = label.split("=", 1)
                                    labels_dict[k] = v

                        # Parse ports (kept as the raw CLI string, not structured)
                        ports_str = c.get("Ports", "")
                        ports = {"raw": ports_str} if ports_str else None

                        # Determine health from status
                        status = c.get("Status", "")
                        health = None
                        if "(healthy)" in status.lower():
                            health = "healthy"
                        elif "(unhealthy)" in status.lower():
                            health = "unhealthy"
                        elif "(health: starting)" in status.lower():
                            health = "starting"

                        await container_repo.upsert(
                            host_id=host_id,
                            container_id=container_id,
                            name=c.get("Names", "").lstrip("/"),
                            image=c.get("Image", ""),
                            state=c.get("State", "unknown").lower(),
                            status=status,
                            health=health,
                            created_at=created_at,
                            ports=ports,
                            labels=labels_dict if labels_dict else None,
                            compose_project=compose_project
                        )

                    # Remove stale containers
                    await container_repo.delete_stale(host_id, container_ids)
                    result["containers_count"] = len(containers_data)

                    # Process images
                    image_ids = []
                    for img in images_data:
                        image_id = img.get("ID", "")
                        image_ids.append(image_id)

                        # Parse size (e.g., "150MB" -> bytes)
                        size_str = img.get("Size", "0")
                        size = self._parse_size(size_str)

                        # Parse created time (same naive-datetime caveat as containers)
                        created = None
                        created_str = img.get("CreatedAt", "")
                        if created_str:
                            try:
                                created = datetime.fromisoformat(
                                    created_str.split(" ")[0] + "T" + created_str.split(" ")[1]
                                )
                            except (ValueError, IndexError):
                                pass

                        # Get repo tags; dangling images ("<none>") get no tags.
                        repo = img.get("Repository", "<none>")
                        tag = img.get("Tag", "<none>")
                        repo_tags = None
                        if repo != "<none>":
                            repo_tags = [f"{repo}:{tag}"]

                        await image_repo.upsert(
                            host_id=host_id,
                            image_id=image_id,
                            repo_tags=repo_tags,
                            size=size,
                            created=created
                        )

                    await image_repo.delete_stale(host_id, image_ids)
                    result["images_count"] = len(images_data)

                    # Process volumes
                    volume_names = []
                    for vol in volumes_data:
                        name = vol.get("Name", "")
                        volume_names.append(name)

                        await volume_repo.upsert(
                            host_id=host_id,
                            name=name,
                            driver=vol.get("Driver", "local"),
                            mountpoint=vol.get("Mountpoint", ""),
                            scope=vol.get("Scope", "local")
                        )

                    await volume_repo.delete_stale(host_id, volume_names)
                    result["volumes_count"] = len(volumes_data)

                    # Update host Docker status
                    await host_repo.update(
                        host,
                        docker_version=docker_version,
                        docker_status="online",
                        docker_last_collect_at=datetime.now(timezone.utc)
                    )

                    # Single commit covers all upserts/prunes and the host update.
                    await session.commit()
                    result["success"] = True
                    result["message"] = "Collection completed"

                    duration_ms = int((time.time() - start_time) * 1000)
                    print(f"{LOG_PREFIX} {host.name}: Collection completed in {duration_ms}ms - {len(containers_data)} containers, {len(images_data)} images, {len(volumes_data)} volumes")

                except SSHConnectionError as e:
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    # Update host status to offline
                    await host_repo.update(host, docker_status="offline")
                    await session.commit()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: SSH connection error - {e}")
                    logger.warning(f"{LOG_PREFIX} SSH error collecting Docker from {host_id}: {e}")

                except DockerNotAvailableError as e:
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    # Host is reachable but Docker itself is unusable.
                    await host_repo.update(host, docker_status="error")
                    await session.commit()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Docker not available - {e}")
                    logger.warning(f"{LOG_PREFIX} Docker not available on {host_id}: {e}")

                except Exception as e:
                    # Unexpected failure: roll back rather than commit partial data.
                    result["error"] = str(e)
                    result["message"] = "Collection failed"
                    import traceback
                    tb = traceback.format_exc()
                    print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Collection error - {e}")
                    logger.error(f"{LOG_PREFIX} Error collecting Docker from {host_id}: {e}\n{tb}")
                    await session.rollback()

        # Always stamp the duration and a fallback message before returning.
        result["duration_ms"] = int((time.time() - start_time) * 1000)
        if not result.get("message"):
            result["message"] = "Collection completed" if result.get("success") else "Collection failed"
        return result
|
|
|
|
async def collect_all_hosts(self) -> Dict[str, Any]:
|
|
"""Collect Docker information from all enabled hosts.
|
|
|
|
Returns:
|
|
Summary of collection results
|
|
"""
|
|
results = {
|
|
"success": True,
|
|
"total_hosts": 0,
|
|
"successful": 0,
|
|
"failed": 0,
|
|
"results": []
|
|
}
|
|
|
|
async with async_session_maker() as session:
|
|
host_repo = HostRepository(session)
|
|
hosts = await host_repo.list_docker_enabled()
|
|
results["total_hosts"] = len(hosts)
|
|
|
|
# Collect from all hosts concurrently (with limit)
|
|
semaphore = asyncio.Semaphore(5) # Max 5 concurrent collections
|
|
|
|
async def collect_with_semaphore(host):
|
|
async with semaphore:
|
|
return await self.collect_docker_host(host.id)
|
|
|
|
if hosts:
|
|
host_results = await asyncio.gather(
|
|
*[collect_with_semaphore(host) for host in hosts],
|
|
return_exceptions=True
|
|
)
|
|
|
|
for r in host_results:
|
|
if isinstance(r, Exception):
|
|
results["failed"] += 1
|
|
results["results"].append({
|
|
"success": False,
|
|
"host_id": "",
|
|
"host_name": "",
|
|
"message": "Collection failed",
|
|
"docker_version": None,
|
|
"containers_count": 0,
|
|
"images_count": 0,
|
|
"volumes_count": 0,
|
|
"duration_ms": 0,
|
|
"error": str(r)
|
|
})
|
|
elif r.get("success"):
|
|
results["successful"] += 1
|
|
results["results"].append(r)
|
|
else:
|
|
results["failed"] += 1
|
|
results["results"].append(r)
|
|
|
|
results["success"] = results["failed"] == 0
|
|
return results
|
|
|
|
def _parse_size(self, size_str: str) -> int:
|
|
"""Parse Docker size string to bytes."""
|
|
if not size_str:
|
|
return 0
|
|
|
|
size_str = size_str.upper().strip()
|
|
multipliers = {
|
|
"TB": 1024 ** 4,
|
|
"GB": 1024 ** 3,
|
|
"MB": 1024 ** 2,
|
|
"KB": 1024,
|
|
"B": 1,
|
|
}
|
|
|
|
# Important: match longer suffixes first, otherwise 'B' would match 'MB'
|
|
for suffix in ("TB", "GB", "MB", "KB", "B"):
|
|
mult = multipliers[suffix]
|
|
if size_str.endswith(suffix):
|
|
try:
|
|
num = float(size_str[:-len(suffix)])
|
|
return int(num * mult)
|
|
except ValueError:
|
|
return 0
|
|
|
|
try:
|
|
return int(size_str)
|
|
except ValueError:
|
|
return 0
|
|
|
|
    async def get_docker_hosts(self) -> List[Dict[str, Any]]:
        """Get all hosts with Docker info.

        Returns one summary dict per visible host: identity, Docker status
        fields and container/image/volume/alert counts.
        """
        async with async_session_maker() as session:
            host_repo = HostRepository(session)
            container_repo = DockerContainerRepository(session)
            image_repo = DockerImageRepository(session)
            volume_repo = DockerVolumeRepository(session)
            alert_repo = DockerAlertRepository(session)

            # Best-effort: read inventory and only show hosts in role_docker group.
            # Visibility is controlled by inventory membership; docker_enabled remains a user toggle.
            docker_host_names: Optional[set[str]] = None
            try:
                inv_hosts = ansible_service.get_hosts_from_inventory()
                docker_host_names = {
                    h.name
                    for h in inv_hosts
                    if "role_docker" in (getattr(h, "groups", None) or [])
                }
            except Exception:
                # Inventory read is best-effort; do not break Docker UI
                docker_host_names = None

            # Return all active hosts so the UI can show both enabled and disabled
            # Docker monitoring states (pause/play).
            hosts = await host_repo.list(limit=1000)
            hosts = [h for h in hosts if not h.deleted_at]
            if docker_host_names is not None:
                hosts = [h for h in hosts if h.name in docker_host_names]
            hosts = sorted(hosts, key=lambda h: (h.name or ""))
            result = []

            for host in hosts:
                # Redundant guard: deleted hosts were already filtered out above.
                if host.deleted_at:
                    continue

                # NOTE(review): four count queries per host (N+1 pattern);
                # presumably acceptable at homelab scale — confirm if host
                # counts grow.
                containers = await container_repo.count_by_host(host.id)
                images = await image_repo.count_by_host(host.id)
                volumes = await volume_repo.count_by_host(host.id)
                open_alerts = await alert_repo.count_open_by_host(host.id)

                result.append({
                    "host_id": host.id,
                    "host_name": host.name,
                    "host_ip": host.ip_address,
                    "docker_enabled": host.docker_enabled,
                    "docker_version": host.docker_version,
                    "docker_status": host.docker_status,
                    "docker_last_collect_at": host.docker_last_collect_at,
                    "containers_total": containers.get("total", 0),
                    "containers_running": containers.get("running", 0),
                    "images_total": images.get("total", 0),
                    "volumes_total": volumes,
                    "open_alerts": open_alerts
                })

            return result
|
|
|
|
async def enable_docker_monitoring(self, host_id: str, enabled: bool = True) -> bool:
|
|
"""Enable or disable Docker monitoring on a host."""
|
|
async with async_session_maker() as session:
|
|
host_repo = HostRepository(session)
|
|
host = await host_repo.get(host_id)
|
|
|
|
if not host:
|
|
return False
|
|
|
|
await host_repo.update(host, docker_enabled=enabled)
|
|
await session.commit()
|
|
|
|
# If enabling, trigger an immediate collection
|
|
if enabled:
|
|
asyncio.create_task(self.collect_docker_host(host_id))
|
|
|
|
return True
|
|
|
|
|
|
# Singleton instance: constructed once at import time and shared by importers.
docker_service = DockerService()
|