"""Docker collection service using SSH. Collects Docker information from hosts via SSH commands. """ from __future__ import annotations import asyncio import json import logging import sys import time from datetime import datetime, timezone from typing import Dict, List, Optional, Any, Tuple try: import asyncssh _asyncssh_import_error: Optional[BaseException] = None except ModuleNotFoundError as e: # pragma: no cover asyncssh = None _asyncssh_import_error = e from app.core.config import settings from app.models.database import async_session_maker from app.crud.host import HostRepository from app.crud.docker_container import DockerContainerRepository from app.crud.docker_image import DockerImageRepository from app.crud.docker_volume import DockerVolumeRepository from app.crud.docker_alert import DockerAlertRepository from app.services.ansible_service import ansible_service logger = logging.getLogger("homelab.docker") # Log prefix for standardized output LOG_PREFIX = "[DOCKER]" class SSHConnectionError(Exception): """Error connecting to host via SSH.""" pass class DockerNotAvailableError(Exception): """Docker is not available on the host.""" pass class DockerService: """Service for collecting Docker information via SSH.""" def __init__(self): self.ssh_key_path = settings.ssh_key_path self.ssh_user = settings.ssh_user self.ssh_remote_user = settings.ssh_remote_user self.connect_timeout = 5 self.exec_timeout = 15 self._locks: Dict[str, asyncio.Lock] = {} def _get_lock(self, host_id: str) -> asyncio.Lock: """Get or create a lock for a host to prevent concurrent operations.""" if host_id not in self._locks: self._locks[host_id] = asyncio.Lock() return self._locks[host_id] async def _ssh_connect(self, host_ip: str) -> asyncssh.SSHClientConnection: """Establish SSH connection to a host.""" if asyncssh is None: err = None try: err = str(_asyncssh_import_error) if _asyncssh_import_error else None except Exception: err = None raise SSHConnectionError( "Missing dependency: asyncssh. Install it to enable Docker SSH collection. " f"(python={sys.executable}, import_error={err})" ) last_err: Optional[BaseException] = None tried_users: List[str] = [] for username in [self.ssh_remote_user, self.ssh_user]: if not username or username in tried_users: continue tried_users.append(username) try: conn = await asyncio.wait_for( asyncssh.connect( host_ip, username=username, client_keys=[self.ssh_key_path], known_hosts=None, # Accept any host key (homelab environment) ), timeout=self.connect_timeout ) return conn except asyncio.TimeoutError as e: last_err = e continue except asyncssh.Error as e: last_err = e continue except Exception as e: last_err = e continue if isinstance(last_err, asyncio.TimeoutError): raise SSHConnectionError(f"SSH connection timeout to {host_ip} (users tried: {', '.join(tried_users)})") raise SSHConnectionError( f"SSH error connecting to {host_ip} (users tried: {', '.join(tried_users)}): {last_err}" ) async def _ssh_exec( self, conn: asyncssh.SSHClientConnection, command: str, timeout: Optional[int] = None ) -> Tuple[str, str, int]: """Execute a command via SSH and return stdout, stderr, exit_code.""" timeout = timeout or self.exec_timeout try: result = await asyncio.wait_for( conn.run(command, check=False), timeout=timeout ) return result.stdout or "", result.stderr or "", result.exit_status except asyncio.TimeoutError: raise SSHConnectionError(f"Command timeout: {command[:50]}...") except Exception as e: raise SSHConnectionError(f"Command execution error: {e}") async def _parse_docker_json_lines(self, output: str) -> List[Dict[str, Any]]: """Parse Docker JSON output (one JSON object per line).""" results = [] for line in output.strip().split('\n'): line = line.strip() if line: try: results.append(json.loads(line)) except json.JSONDecodeError: continue return results async def collect_docker_host(self, host_id: str) -> Dict[str, Any]: """Collect Docker information from a single host. Returns: Dict with collection results including success status, counts, etc. """ start_time = time.time() result = { "success": False, "host_id": host_id, "host_name": "", "message": "", "docker_version": None, "containers_count": 0, "images_count": 0, "volumes_count": 0, "error": None, "duration_ms": 0 } async with self._get_lock(host_id): async with async_session_maker() as session: try: # Get host info host_repo = HostRepository(session) host = await host_repo.get(host_id) if not host: result["error"] = "Host not found" result["message"] = "Host not found" return result result["host_name"] = host.name host_ip = host.ip_address print(f"{LOG_PREFIX} Starting collection for host {host.name} ({host_id[:8]}...) at {host_ip}") # Connect via SSH conn = await self._ssh_connect(host_ip) try: use_sudo: Optional[bool] = None async def run_docker(cmd: str) -> Tuple[str, str, int]: nonlocal use_sudo def _is_sudo_unavailable(stderr: str) -> bool: s = (stderr or "").lower() return any( token in s for token in ( "sudo: a password is required", "sudo: no tty present", "not in the sudoers", "sudo: permission denied", "sudo: command not found", ) ) # If we've already determined sudo behavior, reuse it. if use_sudo is True: return await self._ssh_exec(conn, f"sudo -n {cmd}") if use_sudo is False: return await self._ssh_exec(conn, cmd) # First call: prefer sudo -n (passwordless sudo) to avoid docker.sock permission issues. sudo_out, sudo_err, sudo_code = await self._ssh_exec(conn, f"sudo -n {cmd}") if sudo_code == 0: use_sudo = True return sudo_out, sudo_err, sudo_code # If sudo is not available, fall back to non-sudo. if _is_sudo_unavailable(sudo_err): use_sudo = False return await self._ssh_exec(conn, cmd) # Sudo ran but command failed; try non-sudo as a fallback (e.g., rootless docker). plain_out, plain_err, plain_code = await self._ssh_exec(conn, cmd) if plain_code == 0: use_sudo = False return plain_out, plain_err, plain_code # Keep sudo as preferred for subsequent commands (best chance to avoid docker.sock issues) use_sudo = True merged_err = (plain_err or "").strip() if sudo_err: merged_err = (merged_err + "\n" if merged_err else "") + f"sudo_err: {(sudo_err or '').strip()}" return plain_out, merged_err, plain_code # Get Docker version base_version_cmd = "docker version --format '{{.Server.Version}}'" version_out, version_err, version_code = await run_docker(base_version_cmd) if version_code != 0: err = (version_err or "").strip() if "permission denied" in err.lower() and "docker.sock" in err.lower(): raise DockerNotAvailableError( "Docker not available: permission denied while trying to connect to the Docker daemon socket. " "Fix one of: set SSH_REMOTE_USER=root, add the SSH user to the 'docker' group, " "or allow passwordless sudo for docker (sudo -n). " f"Raw error: {err}" ) raise DockerNotAvailableError(f"Docker not available: {err}") docker_version = version_out.strip() result["docker_version"] = docker_version print(f"{LOG_PREFIX} {host.name}: Docker version {docker_version}, using sudo={use_sudo}") # Collect containers containers_out, _, _ = await run_docker( "docker ps -a --format '{{json .}}' --no-trunc" ) containers_data = await self._parse_docker_json_lines(containers_out) print(f"{LOG_PREFIX} {host.name}: Found {len(containers_data)} container(s)") # Collect images images_out, _, _ = await run_docker( "docker images --format '{{json .}}'" ) images_data = await self._parse_docker_json_lines(images_out) print(f"{LOG_PREFIX} {host.name}: Found {len(images_data)} image(s)") # Collect volumes volumes_out, _, _ = await run_docker( "docker volume ls --format '{{json .}}'" ) volumes_data = await self._parse_docker_json_lines(volumes_out) print(f"{LOG_PREFIX} {host.name}: Found {len(volumes_data)} volume(s)") finally: conn.close() # Store in database container_repo = DockerContainerRepository(session) image_repo = DockerImageRepository(session) volume_repo = DockerVolumeRepository(session) # Process containers container_ids = [] for c in containers_data: container_id = c.get("ID", "") container_ids.append(container_id) # Parse created time created_at = None created_str = c.get("CreatedAt", "") if created_str: try: # Docker format: "2024-01-15 10:30:00 -0500 EST" created_at = datetime.fromisoformat( created_str.split(" ")[0] + "T" + created_str.split(" ")[1] ) except (ValueError, IndexError): pass # Extract compose project from labels compose_project = None labels = c.get("Labels", "") if labels: for label in labels.split(","): if "com.docker.compose.project=" in label: compose_project = label.split("=", 1)[1] break # Parse labels into dict labels_dict = {} if labels: for label in labels.split(","): if "=" in label: k, v = label.split("=", 1) labels_dict[k] = v # Parse ports ports_str = c.get("Ports", "") ports = {"raw": ports_str} if ports_str else None # Determine health from status status = c.get("Status", "") health = None if "(healthy)" in status.lower(): health = "healthy" elif "(unhealthy)" in status.lower(): health = "unhealthy" elif "(health: starting)" in status.lower(): health = "starting" await container_repo.upsert( host_id=host_id, container_id=container_id, name=c.get("Names", "").lstrip("/"), image=c.get("Image", ""), state=c.get("State", "unknown").lower(), status=status, health=health, created_at=created_at, ports=ports, labels=labels_dict if labels_dict else None, compose_project=compose_project ) # Remove stale containers await container_repo.delete_stale(host_id, container_ids) result["containers_count"] = len(containers_data) # Process images image_ids = [] for img in images_data: image_id = img.get("ID", "") image_ids.append(image_id) # Parse size (e.g., "150MB" -> bytes) size_str = img.get("Size", "0") size = self._parse_size(size_str) # Parse created time created = None created_str = img.get("CreatedAt", "") if created_str: try: created = datetime.fromisoformat( created_str.split(" ")[0] + "T" + created_str.split(" ")[1] ) except (ValueError, IndexError): pass # Get repo tags repo = img.get("Repository", "") tag = img.get("Tag", "") repo_tags = None if repo != "": repo_tags = [f"{repo}:{tag}"] await image_repo.upsert( host_id=host_id, image_id=image_id, repo_tags=repo_tags, size=size, created=created ) await image_repo.delete_stale(host_id, image_ids) result["images_count"] = len(images_data) # Process volumes volume_names = [] for vol in volumes_data: name = vol.get("Name", "") volume_names.append(name) await volume_repo.upsert( host_id=host_id, name=name, driver=vol.get("Driver", "local"), mountpoint=vol.get("Mountpoint", ""), scope=vol.get("Scope", "local") ) await volume_repo.delete_stale(host_id, volume_names) result["volumes_count"] = len(volumes_data) # Update host Docker status await host_repo.update( host, docker_version=docker_version, docker_status="online", docker_last_collect_at=datetime.now(timezone.utc) ) await session.commit() result["success"] = True result["message"] = "Collection completed" duration_ms = int((time.time() - start_time) * 1000) print(f"{LOG_PREFIX} {host.name}: Collection completed in {duration_ms}ms - {len(containers_data)} containers, {len(images_data)} images, {len(volumes_data)} volumes") except SSHConnectionError as e: result["error"] = str(e) result["message"] = "Collection failed" # Update host status to offline await host_repo.update(host, docker_status="offline") await session.commit() print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: SSH connection error - {e}") logger.warning(f"{LOG_PREFIX} SSH error collecting Docker from {host_id}: {e}") except DockerNotAvailableError as e: result["error"] = str(e) result["message"] = "Collection failed" await host_repo.update(host, docker_status="error") await session.commit() print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Docker not available - {e}") logger.warning(f"{LOG_PREFIX} Docker not available on {host_id}: {e}") except Exception as e: result["error"] = str(e) result["message"] = "Collection failed" import traceback tb = traceback.format_exc() print(f"{LOG_PREFIX} {result.get('host_name', host_id)}: Collection error - {e}") logger.error(f"{LOG_PREFIX} Error collecting Docker from {host_id}: {e}\n{tb}") await session.rollback() result["duration_ms"] = int((time.time() - start_time) * 1000) if not result.get("message"): result["message"] = "Collection completed" if result.get("success") else "Collection failed" return result async def collect_all_hosts(self) -> Dict[str, Any]: """Collect Docker information from all enabled hosts. Returns: Summary of collection results """ results = { "success": True, "total_hosts": 0, "successful": 0, "failed": 0, "results": [] } async with async_session_maker() as session: host_repo = HostRepository(session) hosts = await host_repo.list_docker_enabled() results["total_hosts"] = len(hosts) # Collect from all hosts concurrently (with limit) semaphore = asyncio.Semaphore(5) # Max 5 concurrent collections async def collect_with_semaphore(host): async with semaphore: return await self.collect_docker_host(host.id) if hosts: host_results = await asyncio.gather( *[collect_with_semaphore(host) for host in hosts], return_exceptions=True ) for r in host_results: if isinstance(r, Exception): results["failed"] += 1 results["results"].append({ "success": False, "host_id": "", "host_name": "", "message": "Collection failed", "docker_version": None, "containers_count": 0, "images_count": 0, "volumes_count": 0, "duration_ms": 0, "error": str(r) }) elif r.get("success"): results["successful"] += 1 results["results"].append(r) else: results["failed"] += 1 results["results"].append(r) results["success"] = results["failed"] == 0 return results def _parse_size(self, size_str: str) -> int: """Parse Docker size string to bytes.""" if not size_str: return 0 size_str = size_str.upper().strip() multipliers = { "TB": 1024 ** 4, "GB": 1024 ** 3, "MB": 1024 ** 2, "KB": 1024, "B": 1, } # Important: match longer suffixes first, otherwise 'B' would match 'MB' for suffix in ("TB", "GB", "MB", "KB", "B"): mult = multipliers[suffix] if size_str.endswith(suffix): try: num = float(size_str[:-len(suffix)]) return int(num * mult) except ValueError: return 0 try: return int(size_str) except ValueError: return 0 async def get_docker_hosts(self) -> List[Dict[str, Any]]: """Get all hosts with Docker info.""" async with async_session_maker() as session: host_repo = HostRepository(session) container_repo = DockerContainerRepository(session) image_repo = DockerImageRepository(session) volume_repo = DockerVolumeRepository(session) alert_repo = DockerAlertRepository(session) # Best-effort: read inventory and only show hosts in role_docker group. # Visibility is controlled by inventory membership; docker_enabled remains a user toggle. docker_host_names: Optional[set[str]] = None try: inv_hosts = ansible_service.get_hosts_from_inventory() docker_host_names = { h.name for h in inv_hosts if "role_docker" in (getattr(h, "groups", None) or []) } except Exception: # Inventory read is best-effort; do not break Docker UI docker_host_names = None # Return all active hosts so the UI can show both enabled and disabled # Docker monitoring states (pause/play). hosts = await host_repo.list(limit=1000) hosts = [h for h in hosts if not h.deleted_at] if docker_host_names is not None: hosts = [h for h in hosts if h.name in docker_host_names] hosts = sorted(hosts, key=lambda h: (h.name or "")) result = [] for host in hosts: if host.deleted_at: continue containers = await container_repo.count_by_host(host.id) images = await image_repo.count_by_host(host.id) volumes = await volume_repo.count_by_host(host.id) open_alerts = await alert_repo.count_open_by_host(host.id) result.append({ "host_id": host.id, "host_name": host.name, "host_ip": host.ip_address, "docker_enabled": host.docker_enabled, "docker_version": host.docker_version, "docker_status": host.docker_status, "docker_last_collect_at": host.docker_last_collect_at, "containers_total": containers.get("total", 0), "containers_running": containers.get("running", 0), "images_total": images.get("total", 0), "volumes_total": volumes, "open_alerts": open_alerts }) return result async def enable_docker_monitoring(self, host_id: str, enabled: bool = True) -> bool: """Enable or disable Docker monitoring on a host.""" async with async_session_maker() as session: host_repo = HostRepository(session) host = await host_repo.get(host_id) if not host: return False await host_repo.update(host, docker_enabled=enabled) await session.commit() # If enabling, trigger an immediate collection if enabled: asyncio.create_task(self.collect_docker_host(host_id)) return True # Singleton instance docker_service = DockerService()