"""
Parser HTML vers Markdown pour la documentation d'aide.
"""
import re
from html.parser import HTMLParser
from pathlib import Path
from typing import List, Tuple
class HelpHtmlToMarkdownParser(HTMLParser):
    """Convert help-documentation HTML into Markdown.

    Usage: create an instance, call ``feed(html)`` (and ``close()``), then
    read the result with ``get_markdown()``.

    Supported elements: headings (``h1``-``h6``), paragraphs, line breaks,
    bold/italic, inline code, ``pre`` blocks (fenced), nested ordered and
    unordered lists, links, tables (pipe tables), horizontal rules and
    block quotes. Everything inside ``script``, ``style``, ``nav`` and
    ``footer`` (text *and* nested markup) is dropped.

    Inline text is accumulated in ``current_text`` and moved to
    ``markdown_lines`` whenever a block-level boundary is reached.
    """

    # Tags whose whole subtree is discarded.
    IGNORED_TAGS = frozenset({"script", "style", "nav", "footer"})
    # Void elements never receive an end tag, so pushing them on tag_stack
    # would leave the stack permanently out of sync.
    VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })
    HEADING_PREFIXES = {
        "h1": "#", "h2": "##", "h3": "###",
        "h4": "####", "h5": "#####", "h6": "######",
    }
    INLINE_MARKERS = {"strong": "**", "b": "**", "em": "*", "i": "*"}
    # Four spaces per level nests correctly under both "- " and "1. " items.
    LIST_INDENT = "    "

    def __init__(self):
        super().__init__()
        self.markdown_lines: List[str] = []
        self.current_text = ""
        self.in_code = False
        self.in_pre = False
        self.in_list = False
        self.list_type = "ul"
        self.list_level = 0
        self.in_table = False
        self.table_row: List[str] = []
        self.in_th = False
        self.ignore_content = False
        self.tag_stack: List[str] = []
        self._href_pending = ""
        # Nesting depth of ignored sections; ignore_content mirrors "> 0".
        self._ignore_depth = 0
        # Types ("ul"/"ol") of the currently open lists, outermost first.
        self._list_types: List[str] = []
        # Opening inline markers not yet written; they are emitted together
        # with the next text so whitespace never ends up inside the markers.
        self._pending_open = ""
        # True when whitespace was seen since the last emitted inline text.
        self._pending_space = False
        # Index into markdown_lines where the open table cell started,
        # or None when no cell is open.
        self._cell_start = None
        # True once the header separator of the current table was written.
        self._header_separator_done = False

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Emit the Markdown that opens ``tag``.

        Args:
            tag: Lower-cased tag name.
            attrs: Attribute ``(name, value)`` pairs as given by HTMLParser.
        """
        if tag not in self.VOID_TAGS:
            self.tag_stack.append(tag)

        if tag in self.IGNORED_TAGS:
            self._ignore_depth += 1
            self.ignore_content = True
            return
        if self.ignore_content:
            # Markup nested in an ignored section must not produce output.
            return

        if tag in self.HEADING_PREFIXES:
            self._flush_text()
            self.markdown_lines.append(f"\n{self.HEADING_PREFIXES[tag]} ")
        elif tag == "p":
            self._flush_text()
            self.markdown_lines.append("\n")
        elif tag == "br":
            # Flush first so the break lands after the text preceding it.
            self._flush_text()
            self.markdown_lines.append("\n")
        elif tag in self.INLINE_MARKERS:
            self._pending_open += self.INLINE_MARKERS[tag]
        elif tag == "code":
            if not self.in_pre:
                self._pending_open += "`"
            self.in_code = True
        elif tag == "pre":
            self._flush_text()
            self.in_pre = True
            self.markdown_lines.append("\n```\n")
        elif tag in ("ul", "ol"):
            self._flush_text()
            self._list_types.append(tag)
            self.in_list = True
            self.list_type = tag
            self.list_level = len(self._list_types)
        elif tag == "li":
            self._flush_text()
            indent = self.LIST_INDENT * max(self.list_level - 1, 0)
            marker = "- " if self.list_type == "ul" else "1. "
            self.markdown_lines.append(f"\n{indent}{marker}")
        elif tag == "a":
            # A valueless href attribute is reported as None.
            self._href_pending = dict(attrs).get("href") or ""
            self._pending_open += "["
        elif tag == "table":
            self._flush_text()
            self.in_table = True
            self._header_separator_done = False
            self.markdown_lines.append("\n")
        elif tag == "tr":
            self.table_row = []
        elif tag == "th":
            self.in_th = True
            self._begin_cell()
        elif tag == "td":
            self._begin_cell()
        elif tag == "hr":
            self._flush_text()
            # The blank line before the rule keeps "text\n---" from being
            # read as a setext heading; surplus newlines are collapsed later.
            self.markdown_lines.append("\n\n---\n\n")
        elif tag == "blockquote":
            self._flush_text()
            self.markdown_lines.append("\n> ")

    def handle_endtag(self, tag: str):
        """Emit the Markdown that closes ``tag``.

        Args:
            tag: Lower-cased tag name.
        """
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()

        if tag in self.IGNORED_TAGS:
            if self._ignore_depth:
                self._ignore_depth -= 1
            self.ignore_content = self._ignore_depth > 0
            return
        if self.ignore_content:
            return

        if tag in self.HEADING_PREFIXES:
            self._flush_text()
            self.markdown_lines.append("\n")
        elif tag == "p":
            self._flush_text()
            self.markdown_lines.append("\n")
        elif tag in self.INLINE_MARKERS:
            self._close_inline(self.INLINE_MARKERS[tag])
        elif tag == "code":
            if not self.in_pre:
                self._close_inline("`")
            self.in_code = False
        elif tag == "pre":
            self._flush_text()
            self.in_pre = False
            # The closing fence must start on its own line.
            if self.markdown_lines and not self.markdown_lines[-1].endswith("\n"):
                self.markdown_lines.append("\n")
            self.markdown_lines.append("```\n")
        elif tag in ("ul", "ol"):
            self._flush_text()
            if not self._list_types:
                return  # stray closing tag
            self._list_types.pop()
            self.list_level = len(self._list_types)
            if self._list_types:
                # Restore the enclosing list's marker style.
                self.list_type = self._list_types[-1]
            else:
                self.in_list = False
                self.markdown_lines.append("\n")
        elif tag == "li":
            self._flush_text()
        elif tag == "a":
            self._close_inline(f"]({self._href_pending})")
            self._href_pending = ""
        elif tag == "tr":
            if self._cell_start is not None:
                # The cell's end tag was omitted; close it implicitly.
                self._end_cell()
            if self.table_row:
                self.markdown_lines.append(
                    "| " + " | ".join(self.table_row) + " |\n"
                )
                if not self._header_separator_done:
                    # Pipe tables need exactly one delimiter row, directly
                    # after the first row.
                    self.markdown_lines.append(
                        "|" + "|".join(["---"] * len(self.table_row)) + "|\n"
                    )
                    self._header_separator_done = True
            self.in_th = False
        elif tag == "th" or tag == "td":
            self._end_cell()
        elif tag == "table":
            self.in_table = False
            self.markdown_lines.append("\n")
        elif tag == "blockquote":
            self._flush_text()
            self.markdown_lines.append("\n")

    def handle_data(self, data: str):
        """Collect text content.

        Text inside ``pre`` is emitted verbatim. Elsewhere, runs of
        whitespace are collapsed to single spaces, while a space separating
        this text from neighbouring inline content is preserved.

        Args:
            data: Raw text between tags.
        """
        if self.ignore_content:
            return
        if self.in_pre:
            self.markdown_lines.append(data)
            return

        words = data.split()
        if not words:
            if data:
                # Whitespace-only chunk: remember it as a separator.
                self._pending_space = True
            return
        if data[0].isspace():
            self._pending_space = True

        # A separator only makes sense after text already on this line.
        separator = " " if self._pending_space and self.current_text else ""
        self.current_text += separator + self._pending_open + " ".join(words)
        self._pending_open = ""
        self._pending_space = data[-1].isspace()

    def _close_inline(self, marker: str):
        """Append a closing inline marker directly after the current text."""
        # Openers that never received text (empty element) are written
        # first so that markers stay paired.
        self.current_text += self._pending_open + marker
        self._pending_open = ""

    def _begin_cell(self):
        """Start capturing the output of a table cell."""
        if self._cell_start is not None:
            # Previous cell had no end tag; close it implicitly.
            self._end_cell()
        self._flush_text()
        self._cell_start = len(self.markdown_lines)

    def _end_cell(self):
        """Finish the open table cell and add its text to ``table_row``."""
        self._flush_text()
        if self._cell_start is None:
            return  # stray closing tag
        start = self._cell_start
        self._cell_start = None
        captured = "".join(self.markdown_lines[start:])
        del self.markdown_lines[start:]
        # A cell must fit on one line, and a literal pipe would end it.
        cell = " ".join(captured.split()).replace("|", "\\|")
        self.table_row.append(cell)

    def _flush_text(self):
        """Move the accumulated inline text to ``markdown_lines``."""
        self.current_text += self._pending_open
        self._pending_open = ""
        self._pending_space = False
        text = self.current_text.strip()
        if text:
            self.markdown_lines.append(text)
        self.current_text = ""

    def get_markdown(self) -> str:
        """Return the Markdown produced so far.

        Pending inline text is flushed, runs of three or more newlines are
        collapsed to one blank line, and surrounding whitespace is removed.

        Returns:
            The converted Markdown document.
        """
        self._flush_text()
        content = "".join(self.markdown_lines)
        # Collapse multiple blank lines.
        content = re.sub(r'\n{3,}', '\n\n', content)
        return content.strip()
def build_help_markdown(html_path: Path = None, html_content: str = None) -> str:
"""Construit le contenu Markdown d'aide depuis un fichier HTML.
Args:
html_path: Chemin vers le fichier HTML source
html_content: Contenu HTML direct (prioritaire sur html_path)
Returns:
Contenu Markdown formaté
"""
if html_content:
content = html_content
elif html_path and html_path.exists():
content = html_path.read_text(encoding='utf-8')
else:
return _get_default_help_markdown()
# Extraire uniquement la section d'aide si présente
help_section_match = re.search(
r'