keep error pages with fallback content
parent 4e473ba2c9
commit 8e9289998b
2 changed files with 63 additions and 14 deletions
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""<html>
+<head>
+<meta charset="utf-8">
+<title>{safe_title}</title>
+</head>
+<body>
+<h1>{safe_title}</h1>
+<p>Lost content (HTTracker) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
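When a page has no usable <title>, build_fallback_html derives a heading from the filename instead. An illustrative call (the filename is hypothetical), assuming the helper as defined above:

    # Hypothetical example: with an empty title, the filename drives the heading.
    html = build_fallback_html(title="", filename="Main_Page.html")
    # safe_title becomes "Main Page" ("_" -> " ", ".html" stripped)
    print(html.splitlines()[3])  # <title>Main Page</title>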
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
     remove_intro_rule_box(content)
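Because is_error_page(soup) scans the full document text, a mirrored error page is caught even when it still carries a #mw-content-text container. A self-contained sketch of that detection path (pattern list inlined for brevity):

    from bs4 import BeautifulSoup

    ERROR_PAGE_PATTERNS = ["503 service unavailable", "404 not found"]

    def is_error_page(soup: BeautifulSoup) -> bool:
        text = soup.get_text(" ", strip=True).lower()
        return any(p in text for p in ERROR_PAGE_PATTERNS)

    snippet = "<html><body><h1>503 Service Unavailable</h1></body></html>"
    print(is_error_page(BeautifulSoup(snippet, "html.parser")))  # True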
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
 
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
     print("✅ Cleaning complete")
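The modulo guard keeps output sparse on large mirrors: one progress line per 200 files plus a final summary line. A quick illustration of the cadence, assuming a hypothetical total of 450 files:

    # Hypothetical run with 450 files: prints at 200, 400, and the end.
    total = 450
    for i in range(1, total + 1):
        if i % 200 == 0 or i == total:
            print(f"{i}/{total} processed ({i/total:.1%})")
    # 200/450 processed (44.4%)
    # 400/450 processed (88.9%)
    # 450/450 processed (100.0%)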