keep error pages with fallback content
This commit is contained in: parent 4e473ba2c9 · commit 8e9289998b
2 changed files with 63 additions and 14 deletions
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
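Note on matching semantics: these patterns are tested as plain lowercase substrings of the whole page text (see is_error_page below), so any page containing one of these phrases matches, including a legitimate article that merely mentions an error code. A minimal standalone sketch of that behavior (pattern list shortened; looks_like_error is a stand-in name for the real is_error_page):

    # Sketch only: the substring test that is_error_page applies below.
    ERROR_PAGE_PATTERNS = ["404 not found", "server error"]

    def looks_like_error(page_text: str) -> bool:
        text = page_text.lower()
        return any(p in text for p in ERROR_PAGE_PATTERNS)

    print(looks_like_error("<h1>404 Not Found</h1>"))           # True
    print(looks_like_error("Essay on the 404 Not Found page"))  # True: false positive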
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""<html>
+<head>
+<meta charset="utf-8">
+<title>{safe_title}</title>
+</head>
+<body>
+<h1>{safe_title}</h1>
+<p>Lost content (HTTrack) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
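For context, a standalone sketch of how the two new helpers interact, assuming only bs4 is installed; the HTML snippet and file name are invented for illustration:

    # Sketch only: an error page with an empty <title> gets a filename-derived heading.
    from bs4 import BeautifulSoup

    html = "<html><head><title></title></head><body>503 Service Unavailable</body></html>"
    soup = BeautifulSoup(html, "html.parser")

    text = soup.get_text(" ", strip=True).lower()
    print("503 service unavailable" in text)  # True -> is_error_page would return True

    # build_fallback_html then derives safe_title from the filename:
    print("Some_Page.html".replace("_", " ").replace(".html", ""))  # "Some Page"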
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
     remove_intro_rule_box(content)
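The reworked guard routes a file to the fallback in two distinct cases: the MediaWiki content div is missing, or the page body matches an error pattern. A compact sketch of that decision (needs_fallback is a stand-in name; the single pattern stands in for ERROR_PAGE_PATTERNS):

    # Sketch only: the two conditions that now trigger the fallback path.
    from bs4 import BeautifulSoup

    def needs_fallback(html: str) -> bool:
        soup = BeautifulSoup(html, "html.parser")
        content = soup.select_one("#mw-content-text")
        is_error = "404 not found" in soup.get_text(" ", strip=True).lower()
        return not content or is_error

    print(needs_fallback("<div id='mw-content-text'>A real article</div>"))  # False
    print(needs_fallback("<div id='mw-content-text'>404 Not Found</div>"))   # True
    print(needs_fallback("<div>No content div at all</div>"))                # True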
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
     print("✅ Cleaning complete")
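The progress gate prints every 200 files and always once at the end. A quick check of the arithmetic with a hypothetical 450-file run:

    # Sketch only: expected progress output for total = 450 (invented figure).
    total = 450
    for i in range(1, total + 1):
        if i % 200 == 0 or i == total:
            print(f"{i}/{total} processed ({i/total:.1%})")
    # 200/450 processed (44.4%)
    # 400/450 processed (88.9%)
    # 450/450 processed (100.0%)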
@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 
 # --------------------------------------------------
 # PATHS
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
-    if not title:
-        return None
-
-    title = unquote(title)
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
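Two things are worth noting in the rewritten normalizer: it now assumes a non-None string (callers must guard), and the unquote() percent-decoding step was dropped in favor of NFKC folding plus curly-quote replacement. A standalone sketch, body copied from the hunk above and run on an invented sample title:

    # Sketch only: normalize_title as added here; note NFKC folds the
    # fullwidth low line U+FF3F into "_" before underscores become spaces.
    import re
    import unicodedata

    def normalize_title(title: str) -> str:
        title = title.strip()
        title = unicodedata.normalize("NFKC", title)
        title = title.replace("_", " ")
        title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
        title = re.sub(r"\s+", " ", title)
        return title.casefold()

    print(normalize_title("  L’Histoire_du＿Web  "))  # "l'histoire du web"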