# homelab_automation/app/services/startup_checks.py
#
# (Viewer metadata at time of capture: 682 lines, 27 KiB, Python)

"""
Service de vérification des prérequis au démarrage de l'application.
Valide les dépendances externes, les clés SSH, et le fonctionnement d'Ansible.
"""
import asyncio
import os
import shutil
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Tuple
from enum import Enum
class CheckStatus(Enum):
    """Outcome of a single startup check."""

    # The check passed.
    OK = "ok"
    # The check found a non-blocking problem.
    WARNING = "warning"
    # The check found a blocking problem.
    ERROR = "error"
    # The check was deliberately not executed.
    SKIPPED = "skipped"
@dataclass
class CheckResult:
    """Result of one individual check."""

    # Short label of the check, printed at the start of its report line.
    name: str
    # Outcome of the check.
    status: CheckStatus
    # One-line summary printed next to the name.
    message: str
    # Optional multi-line text; the report prints it one indented line at a time.
    details: Optional[str] = None
@dataclass
class StartupCheckReport:
    """Aggregated report of every check executed at startup."""

    # Check results, in the order they were added.
    results: List[CheckResult] = field(default_factory=list)

    @property
    def has_errors(self) -> bool:
        """Return True when at least one result has status ERROR."""
        return any(r.status == CheckStatus.ERROR for r in self.results)

    @property
    def has_warnings(self) -> bool:
        """Return True when at least one result has status WARNING."""
        return any(r.status == CheckStatus.WARNING for r in self.results)

    @property
    def all_ok(self) -> bool:
        """Return True when every result is OK or SKIPPED.

        An empty report is considered OK (``all`` over an empty sequence).
        """
        accepted = (CheckStatus.OK, CheckStatus.SKIPPED)
        return all(r.status in accepted for r in self.results)

    def add(self, result: CheckResult) -> None:
        """Append *result* to the report."""
        self.results.append(result)

    def print_report(self) -> None:
        """Print the report to standard output.

        One line per result (icon, name, message), followed by its non-blank
        detail lines, then a one-line overall verdict.
        """
        print("\n" + "=" * 60)
        print("🔍 VÉRIFICATION DES PRÉREQUIS AU DÉMARRAGE")
        print("=" * 60)
        for result in self.results:
            icon = self._get_status_icon(result.status)
            print(f"{icon} {result.name}: {result.message}")
            if result.details:
                # Indent each non-blank detail line under its check.
                for line in result.details.split('\n'):
                    if line.strip():
                        print(f" └─ {line}")
        print("-" * 60)
        if self.all_ok:
            print("✅ Tous les prérequis sont satisfaits")
        elif self.has_errors:
            print("❌ Des erreurs critiques ont été détectées")
        else:
            print("⚠️ Des avertissements ont été détectés")
        print("=" * 60 + "\n")

    def _get_status_icon(self, status: CheckStatus) -> str:
        """Return the icon printed in front of a result with *status*.

        OK and ERROR previously mapped to an empty string, which made passing
        and failing lines indistinguishable in the printed report; they now
        use the same symbols as the summary lines of ``print_report``.
        Unknown statuses yield an empty string.
        """
        icons = {
            CheckStatus.OK: "✅",
            CheckStatus.WARNING: "⚠️ ",
            CheckStatus.ERROR: "❌",
            CheckStatus.SKIPPED: "⏭️ ",
        }
        return icons.get(status, "")
class StartupChecksService:
    """Run the startup prerequisite checks and gather them into a report.

    Every ``_check_*`` coroutine appends exactly one ``CheckResult`` to
    ``self.report``. External commands are executed with ``subprocess.run``
    inside ``asyncio.to_thread`` so the event loop is not blocked.
    """

    # Environment variables to inspect: (key, required, sensitive, dev_default).
    # ``required`` marks variables that matter for security/configuration even
    # though the application has an internal fallback; ``sensitive`` values are
    # masked before being displayed; ``dev_default`` is the known development
    # value (None when there is none).
    # NOTE(review): DATABASE_URL is displayed unmasked; if it can embed
    # credentials it should probably be flagged sensitive — confirm.
    _ENV_DEFS = (
        # Security / auth
        ("API_KEY", True, True, "dev-key-12345"),
        ("JWT_SECRET_KEY", True, True, "homelab-secret-key-change-in-production"),
        ("JWT_EXPIRE_MINUTES", False, False, "1440"),
        # Database
        ("DATABASE_URL", False, False, None),
        ("DB_PATH", False, False, None),
        # Logs and paths
        ("LOGS_DIR", False, False, "/logs"),
        ("DIR_LOGS_TASKS", False, False, "./tasks_logs"),
        # SSH / Ansible
        ("SSH_USER", False, False, "automation"),
        ("SSH_REMOTE_USER", False, False, "root"),
        ("SSH_KEY_PATH", False, False, None),
        ("ANSIBLE_INVENTORY", False, False, "./ansible/inventory"),
        ("ANSIBLE_PLAYBOOKS", False, False, "./ansible/playbooks"),
        ("ANSIBLE_GROUP_VARS", False, False, "./ansible/inventory/group_vars"),
        # ntfy notifications
        ("NTFY_BASE_URL", False, False, "http://localhost:8150"),
        ("NTFY_DEFAULT_TOPIC", False, False, "homelab-events"),
        ("NTFY_ENABLED", False, False, "true"),
        ("NTFY_TIMEOUT", False, False, "5"),
        ("NTFY_MSG_TYPE", False, False, "ALL"),
        ("NTFY_USERNAME", False, True, None),
        ("NTFY_PASSWORD", False, True, None),
        ("NTFY_TOKEN", False, True, None),
    )

    # Line prefixes (tracebacks, Python source) ignored when looking for a
    # version string. Lines are stripped before matching, so no
    # whitespace-prefixed pattern is needed.
    _VERSION_SKIP_PREFIXES = (
        'Traceback', 'File', 'from ', 'import ', '~', '^',
        'if ', 'def ', 'class ', 'return ', 'raise ', 'OSError', 'WinError',
    )

    # Same idea for the output of a failed Ansible ping.
    _PING_SKIP_PREFIXES = (
        'Traceback', 'File ', 'from ', 'import ',
        '~', '^', 'check_', 'if ', 'def ', 'OSError', 'raise ',
    )

    # Lower-case markers identifying a useful line in a failed ping output.
    _PING_ERROR_KEYWORDS = ('error', 'failed', 'unreachable', 'fatal', 'msg:', 'permission')

    def __init__(
        self,
        ansible_dir: Path,
        ssh_key_path: str,
        ssh_user: str = "automation",
        test_host: str = "localhost",
    ):
        """Store the configuration used by the checks.

        Args:
            ansible_dir: Directory containing ``ansible.cfg`` and
                ``inventory/hosts.yml``.
            ssh_key_path: Path to the SSH private key.
            ssh_user: User for the SSH and Ansible connection tests.
            test_host: Host targeted by the SSH and Ansible ping tests.
        """
        # Accept a plain string as well as a Path: the checks use the "/" operator.
        self.ansible_dir = Path(ansible_dir)
        self.ssh_key_path = Path(ssh_key_path)
        self.ssh_user = ssh_user
        self.test_host = test_host
        self.report = StartupCheckReport()

    async def run_all_checks(self) -> StartupCheckReport:
        """Run every check in order and return the resulting report.

        ``self.report`` is replaced by a fresh report at the start of each call.
        """
        self.report = StartupCheckReport()
        checks = (
            self._check_python_packages,    # 1. required Python packages
            self._check_env_vars,           # 2. environment variables
            self._check_system_tools,       # 3. system tools (ansible, ssh)
            self._check_ssh_key,            # 4. SSH key
            self._check_ansible_config,     # 5. Ansible configuration
            self._check_ansible_inventory,  # 6. Ansible inventory
            self._check_ssh_connection,     # 7. SSH connection to the test host
            self._check_ansible_ping,       # 8. Ansible ping of the test host
        )
        for check in checks:
            await check()
        return self.report

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _record(
        self,
        name: str,
        status: CheckStatus,
        message: str,
        details: Optional[str] = None,
    ) -> None:
        """Append a ``CheckResult`` built from the arguments to the report."""
        self.report.add(CheckResult(name=name, status=status, message=message, details=details))

    @staticmethod
    async def _run_command(
        cmd: List[str],
        timeout: int,
        cwd: Optional[str] = None,
    ) -> subprocess.CompletedProcess:
        """Run *cmd* in a worker thread, capturing stdout/stderr as text.

        Raises whatever ``subprocess.run`` raises (``OSError``,
        ``subprocess.TimeoutExpired``, ...).
        """
        return await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
        )

    @staticmethod
    def _mask_secret(value: str) -> str:
        """Return a display-safe form of a sensitive value.

        Values of 8 characters or fewer are fully masked: showing the first
        and last two characters would reveal most of a short secret. Longer
        values keep their first and last two characters.
        """
        if len(value) <= 8:
            return "*" * len(value)
        return value[:2] + "***" + value[-2:]

    @classmethod
    def _extract_version_line(cls, tool: str, output: str) -> str:
        """Return the most plausible version line of *output* (max 60 chars).

        Returns an empty string when no suitable line is found.
        """
        candidates = [
            line
            for line in (raw.strip() for raw in output.split('\n'))
            if line and not line.startswith(cls._VERSION_SKIP_PREFIXES)
        ]
        is_ansible = tool.startswith('ansible')

        # First pass: a line that looks like a version banner.
        for line in candidates:
            lowered = line.lower()
            if any(marker in lowered for marker in ('version', 'openssh', 'core [')):
                return line[:60]
            # Ansible-specific banner pattern.
            if is_ansible and 'ansible' in lowered and '[' in line:
                return line[:60]

        # Second pass: first line that does not look like code.
        for line in candidates:
            if not any(ch in line for ch in '():='):
                return line[:60]
        return ""

    @classmethod
    def _count_hosts_and_groups(cls, node) -> Tuple[int, int]:
        """Recursively count hosts and child groups below an inventory node.

        A node that is not a mapping counts as empty. A ``children`` key with
        a null value is treated as "no children"; any other non-mapping value
        raises and is reported by the caller.
        """
        if not isinstance(node, dict):
            return 0, 0
        hosts = node.get('hosts')
        host_total = len(hosts) if isinstance(hosts, dict) else 0
        group_total = 0
        children = node.get('children')
        if children is not None:
            group_total += len(children)
            for child in children.values():
                child_hosts, child_groups = cls._count_hosts_and_groups(child)
                host_total += child_hosts
                group_total += child_groups
        return host_total, group_total

    @classmethod
    def _summarize_ansible_error(cls, output: str) -> str:
        """Keep at most two relevant error lines (80 chars each) of *output*.

        Traceback/source lines are dropped; falls back to a generic hint when
        nothing relevant is found.
        """
        useful_lines: List[str] = []
        for raw in output.split('\n'):
            line = raw.strip()
            if not line or line.startswith(cls._PING_SKIP_PREFIXES):
                continue
            lowered = line.lower()
            if any(keyword in lowered for keyword in cls._PING_ERROR_KEYWORDS):
                useful_lines.append(line[:80])
                if len(useful_lines) >= 2:
                    break
        return "\n".join(useful_lines) if useful_lines else "Vérifiez la configuration Ansible"

    # ------------------------------------------------------------------
    # Checks
    # ------------------------------------------------------------------

    async def _check_python_packages(self):
        """Check that the required Python packages can be imported."""
        # (import name, distribution name shown to the user)
        required_packages = [
            ("ansible", "ansible"),
            ("yaml", "pyyaml"),
            ("aiosqlite", "aiosqlite"),
            ("sqlalchemy", "sqlalchemy"),
            ("fastapi", "fastapi"),
            ("uvicorn", "uvicorn"),
            ("httpx", "httpx"),
            ("apscheduler", "apscheduler"),
        ]
        missing = []
        for import_name, package_name in required_packages:
            try:
                __import__(import_name)
            except ImportError:
                missing.append(package_name)

        if missing:
            self._record(
                "Packages Python",
                CheckStatus.ERROR,
                f"{len(missing)} package(s) manquant(s)",
                f"Manquants: {', '.join(missing)}",
            )
        else:
            self._record(
                "Packages Python",
                CheckStatus.OK,
                f"{len(required_packages)} packages requis installés",
            )

    async def _check_env_vars(self):
        """Report the important environment variables (sensitive ones masked).

        A missing ``required`` variable, or a ``required`` variable still set
        to its known development value, counts as a warning. This check never
        produces an ERROR: the application has internal fallbacks.
        """
        details_lines: List[str] = []
        warnings = 0
        for key, required, sensitive, dev_default in self._ENV_DEFS:
            value = os.environ.get(key)
            if not value:  # unset or empty
                if required:
                    warnings += 1
                    details_lines.append(f"{key}=<non défini> (valeur par défaut interne utilisée)")
                else:
                    details_lines.append(f"{key}=<non défini>")
                continue

            display_value = self._mask_secret(value) if sensitive else value
            # Detect known development values left in place.
            if required and dev_default is not None and value == dev_default:
                warnings += 1
                details_lines.append(f"{key}={display_value} (valeur de DEV, à changer en production)")
            else:
                details_lines.append(f"{key}={display_value}")

        if warnings > 0:
            status = CheckStatus.WARNING
            message = f"{warnings} avertissement(s) de configuration d'environnement"
        else:
            status = CheckStatus.OK
            message = "Variables d'environnement principales définies"
        self._record("Variables d'environnement", status, message, "\n".join(details_lines))

    async def _check_system_tools(self):
        """Check that the required system tools are on PATH and report versions."""
        tools = {
            "ansible": ["ansible", "--version"],
            "ansible-playbook": ["ansible-playbook", "--version"],
            "ssh": ["ssh", "-V"],
        }
        # (tool, found, description)
        results: List[Tuple[str, bool, str]] = []
        for tool, cmd in tools.items():
            path = shutil.which(tool)
            if not path:
                results.append((tool, False, "non trouvé"))
                continue
            try:
                completed = await self._run_command(cmd, timeout=10)
                # Some tools print their version on stderr (e.g. ``ssh -V``).
                version_line = self._extract_version_line(
                    tool, completed.stdout + completed.stderr
                )
            except Exception:
                # Best effort: the tool exists, only its version is unknown.
                version_line = ""
            results.append((tool, True, version_line if version_line else f"installé à {path}"))

        details = "\n".join(f"{tool}: {info}" for tool, _, info in results)
        missing = [tool for tool, found, _ in results if not found]
        if missing:
            self._record(
                "Outils système",
                CheckStatus.ERROR,
                f"{len(missing)} outil(s) manquant(s): {', '.join(missing)}",
                details,
            )
        else:
            self._record(
                "Outils système",
                CheckStatus.OK,
                f"{', '.join(tools)} disponibles",
                details,
            )

    async def _check_ssh_key(self):
        """Check that the SSH key exists, has safe permissions and is valid."""
        name = "Clé SSH"
        key_path = self.ssh_key_path
        if not key_path.exists():
            self._record(name, CheckStatus.ERROR, "Clé SSH non trouvée", f"Chemin: {key_path}")
            return

        # Permission bits are only meaningful on non-Windows systems.
        if os.name != 'nt':
            mode = f"{key_path.stat().st_mode & 0o777:03o}"
            if mode not in ('600', '400'):
                self._record(
                    name,
                    CheckStatus.WARNING,
                    f"Permissions incorrectes ({mode})",
                    f"Chemin: {key_path}\nPermissions recommandées: 600",
                )
                return

        # Ask ssh-keygen for the fingerprint to validate the key.
        try:
            completed = await self._run_command(
                ["ssh-keygen", "-l", "-f", str(key_path)], timeout=10
            )
        except FileNotFoundError:
            # ssh-keygen is not available (e.g. Windows without OpenSSH).
            self._record(
                name,
                CheckStatus.OK,
                "Clé SSH présente (validation partielle)",
                f"Chemin: {key_path}\nTaille: {key_path.stat().st_size} bytes",
            )
            return
        except Exception as e:
            self._record(
                name,
                CheckStatus.WARNING,
                f"Impossible de valider la clé: {str(e)}",
                f"Chemin: {key_path}",
            )
            return

        if completed.returncode == 0:
            key_info = completed.stdout.strip()
            self._record(name, CheckStatus.OK, "Clé SSH valide", f"Chemin: {key_path}\n{key_info}")
        else:
            self._record(name, CheckStatus.ERROR, "Clé SSH invalide", completed.stderr.strip())

    async def _check_ansible_config(self):
        """Check that ``ansible.cfg`` exists and mentions the essential settings.

        The test is textual: the file must contain the substrings
        ``[defaults]`` and ``inventory``.
        """
        name = "Configuration Ansible"
        ansible_cfg = self.ansible_dir / "ansible.cfg"
        if not ansible_cfg.exists():
            self._record(
                name,
                CheckStatus.WARNING,
                "Fichier ansible.cfg non trouvé",
                f"Chemin attendu: {ansible_cfg}",
            )
            return

        try:
            content = ansible_cfg.read_text(encoding="utf-8")
        except Exception as e:
            self._record(name, CheckStatus.ERROR, f"Erreur lecture ansible.cfg: {str(e)}")
            return

        has_defaults = "[defaults]" in content
        has_inventory = "inventory" in content
        if has_defaults and has_inventory:
            self._record(name, CheckStatus.OK, "ansible.cfg valide", f"Chemin: {ansible_cfg}")
        else:
            # Show which element is present/missing (both markers used to be
            # empty strings, so the details carried no information).
            self._record(
                name,
                CheckStatus.WARNING,
                "Configuration Ansible incomplète",
                f"[defaults]: {'✓' if has_defaults else '✗'}, "
                f"inventory: {'✓' if has_inventory else '✗'}",
            )

    async def _check_ansible_inventory(self):
        """Check that ``inventory/hosts.yml`` loads and count its hosts/groups.

        Hosts and groups are counted below the top-level ``all`` group.
        """
        name = "Inventaire Ansible"
        inventory_path = self.ansible_dir / "inventory" / "hosts.yml"
        if not inventory_path.exists():
            self._record(
                name,
                CheckStatus.ERROR,
                "Fichier d'inventaire non trouvé",
                f"Chemin attendu: {inventory_path}",
            )
            return

        try:
            import yaml

            inventory = yaml.safe_load(inventory_path.read_text(encoding="utf-8"))
            if not isinstance(inventory, dict):
                # Empty file or a YAML document that is not a mapping.
                self._record(
                    name,
                    CheckStatus.ERROR,
                    "Inventaire vide ou invalide",
                    f"Chemin: {inventory_path}",
                )
                return
            host_count, group_count = self._count_hosts_and_groups(inventory.get('all'))
        except Exception as e:
            self._record(name, CheckStatus.ERROR, f"Erreur lecture inventaire: {str(e)}")
            return

        self._record(
            name,
            CheckStatus.OK,
            f"{host_count} hôte(s) dans {group_count} groupe(s)",
            f"Chemin: {inventory_path}",
        )

    async def _check_ssh_connection(self):
        """Test the SSH connection to the test host.

        Skipped for ``localhost`` (Ansible uses a local connection there) and
        when the SSH key is missing. Failures are reported as warnings.
        """
        name = "Connexion SSH"
        if self.test_host == "localhost":
            self._record(
                name,
                CheckStatus.SKIPPED,
                "Test SSH ignoré pour localhost",
                "Utilisation de la connexion locale Ansible",
            )
            return
        if not self.ssh_key_path.exists():
            self._record(
                name,
                CheckStatus.SKIPPED,
                "Test SSH ignoré (clé SSH non disponible)",
                f"Clé manquante: {self.ssh_key_path}",
            )
            return

        # NOTE(review): StrictHostKeyChecking=no disables host-key
        # verification for this probe; confirm this is acceptable here.
        cmd = [
            "ssh",
            "-o", "StrictHostKeyChecking=no",
            "-o", "BatchMode=yes",
            "-o", "ConnectTimeout=5",
            "-i", str(self.ssh_key_path),
            f"{self.ssh_user}@{self.test_host}",
            "echo", "SSH_OK",
        ]
        try:
            completed = await self._run_command(cmd, timeout=15)
        except subprocess.TimeoutExpired:
            self._record(
                name,
                CheckStatus.WARNING,
                f"Timeout connexion SSH vers {self.test_host}",
                "La connexion a dépassé le délai de 15 secondes",
            )
            return
        except Exception as e:
            self._record(name, CheckStatus.WARNING, f"Test SSH non effectué: {str(e)}")
            return

        if completed.returncode == 0 and "SSH_OK" in completed.stdout:
            self._record(
                name,
                CheckStatus.OK,
                f"Connexion SSH vers {self.test_host} réussie",
                f"Utilisateur: {self.ssh_user}",
            )
        else:
            error_msg = completed.stderr.strip() if completed.stderr else "Erreur inconnue"
            self._record(
                name,
                CheckStatus.WARNING,
                f"Connexion SSH vers {self.test_host} échouée",
                f"Erreur: {error_msg[:100]}",
            )

    async def _check_ansible_ping(self):
        """Run the Ansible ``ping`` module against the test host.

        ``localhost`` uses a local connection; any other host uses the
        configured SSH key and user. Failures are reported as warnings.
        """
        name = "Ansible Ping"
        cmd = [
            "ansible",
            self.test_host,
            "-m", "ping",
            "-i", str(self.ansible_dir / "inventory" / "hosts.yml"),
        ]
        if self.test_host == "localhost":
            cmd += ["-c", "local"]  # local connection, no SSH needed
        else:
            cmd += ["--private-key", str(self.ssh_key_path), "-u", self.ssh_user]
        cmd.append("-o")  # one-line output

        try:
            completed = await self._run_command(cmd, timeout=30, cwd=str(self.ansible_dir))
        except subprocess.TimeoutExpired:
            self._record(
                name,
                CheckStatus.WARNING,
                "Timeout Ansible ping",
                "L'exécution a dépassé 30 secondes",
            )
            return
        except Exception as e:
            self._record(name, CheckStatus.WARNING, f"Test Ansible non effectué: {str(e)}")
            return

        if completed.returncode == 0 and "SUCCESS" in completed.stdout:
            self._record(
                name,
                CheckStatus.OK,
                f"Ansible ping vers {self.test_host} réussi",
                "Module ping exécuté avec succès",
            )
            return

        error_output = completed.stdout + completed.stderr
        # Windows-specific failure signature.
        if 'WinError' in error_output or 'blocking_io' in error_output.lower():
            self._record(
                name,
                CheckStatus.WARNING,
                "Ansible non compatible avec cet environnement Windows",
                "Ansible fonctionne mieux sous WSL ou Linux",
            )
            return

        self._record(
            name,
            CheckStatus.WARNING,
            f"Ansible ping vers {self.test_host} échoué",
            self._summarize_ansible_error(error_output),
        )
# Global service instance; None until run_startup_checks() assigns it at startup.
startup_checks_service: Optional[StartupChecksService] = None
async def run_startup_checks(
    ansible_dir: Path,
    ssh_key_path: str,
    ssh_user: str = "automation",
    test_host: str = "localhost",
) -> StartupCheckReport:
    """Run the startup checks, print the report and return it.

    A new ``StartupChecksService`` is created on every call and stored in the
    module-level ``startup_checks_service``.

    Args:
        ansible_dir: Path to the Ansible directory.
        ssh_key_path: Path to the SSH private key.
        ssh_user: SSH user for the tests.
        test_host: Test host for the SSH/Ansible connections.

    Returns:
        The report of the checks that were run.
    """
    global startup_checks_service

    service = StartupChecksService(
        ansible_dir=ansible_dir,
        ssh_key_path=ssh_key_path,
        ssh_user=ssh_user,
        test_host=test_host,
    )
    # Publish the instance before running, as callers may look it up.
    startup_checks_service = service

    checks_report = await service.run_all_checks()
    checks_report.print_report()
    return checks_report