keep error pages with fallback content

Maxime Réaux, 2026-04-15 10:36:21 +02:00
commit 8e9289998b
parent 4e473ba2c9
2 changed files with 63 additions and 14 deletions


@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+    return f"""<html>
+<head>
+    <meta charset="utf-8">
+    <title>{safe_title}</title>
+</head>
+<body>
+    <h1>{safe_title}</h1>
+    <p>Lost content (HTTrack) in {filename}</p>
+</body>
+</html>
+"""
+
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
 
     remove_intro_rule_box(content)
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
 
     print("✅ Cleaning complete")


@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 
 # --------------------------------------------------
 # PATHS
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
-    if not title:
-        return None
-    title = unquote(title)
-    title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
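
Likewise, a standalone sketch of the revised normalize_title (the smart-quote arguments in the replace chain are reconstructed from context, since the characters were lost in rendering; the sample titles are invented):

import re
import unicodedata
from urllib.parse import unquote

def normalize_title(title: str) -> str:
    title = title.strip()
    title = unicodedata.normalize("NFKC", title)
    title = unquote(title)                 # decode %27 and friends
    title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()

print(normalize_title("L%27Empire_du  Milieu"))  # l'empire du milieu
print(normalize_title("Rob’s_Page"))             # rob's page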