ObsiGate/backend/secret_redactor.py

"""
Secret redactor: masks sensitive patterns in rendered text.

Scans for common secret patterns and replaces them with [MASQUÉ]
before content is served to the frontend. Prevents accidental
exposure of API keys, tokens, and passwords in previews.

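Patterns detected:
    - Generic API keys (long values following a key/secret/token/password prefix)
    - JWT tokens (eyJ... base64url)
    - AWS access key IDs (AKIA...) and prefixed API keys (sk-..., pk-..., rk-...)
    - GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_)
    - Long hexadecimal strings (40 to 64 characters)
    - Private key blocks (-----BEGIN ... PRIVATE KEY-----)
    - Connection strings with passwords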
"""

import re
import logging

logger = logging.getLogger("obsigate.redactor")

# --- Patterns ---
# Each entry is (compiled regex, replacement). ``redact`` applies the entries
# in list order, each one to the output of the previous one, so more specific
# patterns must come first. A replacement is either a template string or a
# callable accepted by ``re.Pattern.subn``.
_PATTERNS = [
    # Private key blocks (PEM armor with an optional algorithm qualifier).
    # DOTALL lets ``.*?`` span the multi-line key body.
    (
        re.compile(
            r'-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----'
            r'.*?'
            r'-----END (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----',
            re.DOTALL,
        ),
        '[CLÉ PRIVÉE MASQUÉE]',
    ),

    # JWT tokens (three base64url segments, the first starting with eyJ)
    (
        re.compile(r'eyJ[a-zA-Z0-9_-]{20,}\.[a-zA-Z0-9_-]{20,}\.[a-zA-Z0-9_-]{20,}'),
        '[JWT MASQUÉ]',
    ),

    # Connection strings with passwords: masks ``scheme://user:password@``
    # and leaves the host part visible.
    # - The username may be empty (``redis://:password@host``).
    # - The username may not contain ':', '@', '/' or whitespace, so a match
    #   cannot run past the host or onto a following line.
    (
        re.compile(
            r'(?:mongodb(?:\+srv)?|mysql|postgres(?:ql)?|rediss?|sqlite)://'
            r'[^:@/\s]*:[^@\s]+@'
        ),
        '[CONNECTION_STRING MASQUÉE]',
    ),

    # Generic assignments: key=... / token: ... / "secret": "..." with a
    # value of at least 20 non-space, non-quote characters.
    # The ``key`` group keeps the keyword, an optional closing quote (for
    # JSON-style quoted keys) and any whitespace before the separator; the
    # separator and value are replaced by ``=[MASQUÉ]``.
    (
        re.compile(
            r'(?P<key>(?:api[_-]?key|apikey|secret|token|password|passwd|auth[_-]?token)[\'"]?\s*)'
            r'[:=]\s*[\'"]?[^\s\'"]{20,}[\'"]?',
            re.IGNORECASE,
        ),
        r'\g<key>=[MASQUÉ]',
    ),

    # Prefixed API keys (sk-..., pk-..., rk-...) with 20+ alphanumerics
    (re.compile(r'(?:sk|pk|rk)-[a-zA-Z0-9]{20,}'), '[CLÉ API MASQUÉE]'),

    # AWS access keys
    (re.compile(r'AKIA[0-9A-Z]{16}'), '[AWS_KEY MASQUÉ]'),

    # GitHub tokens (ghp_, gho_, ghu_, ghs_, ghr_)
    (re.compile(r'gh[pousr]_[a-zA-Z0-9]{36,}'), '[GITHUB_TOKEN MASQUÉ]'),

    # Generic random-looking hex strings: word-bounded runs of 40 to 64 hex
    # characters. Longer runs are not matched because of the \b anchors.
    (re.compile(r'\b[a-fA-F0-9]{40,64}\b'), '[HEX_KEY MASQUÉ]'),
]


def redact(text: str) -> tuple:
    """Redact sensitive patterns from text.

    Every entry of ``_PATTERNS`` is applied in order, each one to the output
    of the previous one. An INFO record is logged when at least one
    substitution was made.

    Args:
        text: The raw text content to scan.

    Returns:
        (redacted_text, redaction_count) tuple, where ``redaction_count`` is
        the total number of substitutions across all patterns.
    """
    count = 0
    result = text
    for pattern, replacement in _PATTERNS:
        # subn accepts both template strings and callables, so no dispatch
        # on the replacement type is needed.
        result, n = pattern.subn(replacement, result)
        count += n
    if count:
        logger.info("Redacted %d secret(s) from content", count)
    return result, count


def redact_file_content(content: str, file_path: str = "") -> str:
    """Return a file's content with secrets masked, for preview rendering.

    Delegates the masking to ``redact`` and emits a WARNING record naming
    the file whenever at least one substitution was made.

    Args:
        content: Raw file content.
        file_path: Optional file path, used only in the log message; an
            empty value is reported as ``<unknown>``.

    Returns:
        Redacted content string.
    """
    masked_text, hits = redact(content)
    if hits <= 0:
        # Nothing was masked: no warning to emit.
        return masked_text
    logger.warning(f"Redacted {hits} potential secret(s) from {file_path or '<unknown>'}")
    return masked_text