diff --git a/extract_content.py b/extract_content.py
index 18ceabc..7d0b1df 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""
+<html>
+<head>
+    <title>{safe_title}</title>
+</head>
+<body>
+    <h1>{safe_title}</h1>
+    <p>Lost content (HTTrack) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
-
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
 
     remove_intro_rule_box(content)
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
-
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
 
     print("✅ Cleaning complete")
 
diff --git a/scan_internal_links.py b/scan_internal_links.py
index 0858a57..25fc6d7 100644
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 # --------------------------------------------------
 # PATHS
 # --------------------------------------------------
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
-    if not title:
-        return None
-
-    title = unquote(title)
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
-
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
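Illustrative sanity check for the new error-page helpers (a minimal sketch, assuming extract_content.py can be imported without side effects, i.e. process_all() only runs behind an if __name__ == "__main__": guard):

from bs4 import BeautifulSoup
from extract_content import is_error_page, build_fallback_html

# The lowercased page text contains "503 service unavailable", so a pattern matches.
soup = BeautifulSoup("<html><body><h1>503 Service Unavailable</h1></body></html>", "html.parser")
print(is_error_page(soup))  # expected: True

# "Some_Page.html" is a made-up file name; with no title available,
# the fallback heading is derived from it and becomes "Some Page".
print(build_fallback_html(title="", filename="Some_Page.html"))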