diff --git a/extract_content.py b/extract_content.py
index 18ceabc..7d0b1df 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""
+<html>
+<head>
+    <title>{safe_title}</title>
+</head>
+<body>
+    <h1>{safe_title}</h1>
+    <p>Lost content (HTTrack) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
-
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
 
     remove_intro_rule_box(content)
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
-
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
 
     print("✅ Cleaning complete")
 
diff --git a/scan_internal_links.py b/scan_internal_links.py
index 0858a57..25fc6d7 100644
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 # --------------------------------------------------
 # PATHS
 # --------------------------------------------------
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
-    if not title:
-        return None
-
-    title = unquote(title)
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
-
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
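Illustrative sanity check for the new error-page helpers (a minimal sketch, assuming extract_content.py can be imported without side effects, i.e. process_all() only runs behind an if __name__ == "__main__": guard):

from bs4 import BeautifulSoup
from extract_content import is_error_page, build_fallback_html

# The lowercased page text contains "503 service unavailable", so a pattern matches.
soup = BeautifulSoup("<html><body><h1>503 Service Unavailable</h1></body></html>", "html.parser")
print(is_error_page(soup))  # expected: True

# "Some_Page.html" is a made-up file name; with no title available,
# the fallback heading is derived from it and becomes "Some Page".
print(build_fallback_html(title="", filename="Some_Page.html"))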