keep error pages with fallback content
This commit is contained in: parent 4e473ba2c9 · commit 8e9289998b
2 changed files with 63 additions and 14 deletions
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
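Note on matching semantics: these patterns are tested as plain lowercase substrings of the whole page text (see is_error_page below), so any page containing one of these phrases matches, including a legitimate article that merely mentions an error code. A minimal standalone sketch of that behavior (pattern list shortened; looks_like_error is a stand-in name for the real is_error_page):

    # Sketch only: the substring test that is_error_page applies below.
    ERROR_PAGE_PATTERNS = ["404 not found", "server error"]

    def looks_like_error(page_text: str) -> bool:
        text = page_text.lower()
        return any(p in text for p in ERROR_PAGE_PATTERNS)

    print(looks_like_error("<h1>404 Not Found</h1>"))           # True
    print(looks_like_error("Essay on the 404 Not Found page"))  # True: false positive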
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""<html>
+<head>
+<meta charset="utf-8">
+<title>{safe_title}</title>
+</head>
+<body>
+<h1>{safe_title}</h1>
+<p>Lost content (HTTrack) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
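For context, a standalone sketch of how the two new helpers interact, assuming only bs4 is installed; the HTML snippet and file name are invented for illustration:

    # Sketch only: an error page with an empty <title> gets a filename-derived heading.
    from bs4 import BeautifulSoup

    html = "<html><head><title></title></head><body>503 Service Unavailable</body></html>"
    soup = BeautifulSoup(html, "html.parser")

    text = soup.get_text(" ", strip=True).lower()
    print("503 service unavailable" in text)  # True -> is_error_page would return True

    # build_fallback_html then derives safe_title from the filename:
    print("Some_Page.html".replace("_", " ").replace(".html", ""))  # "Some Page"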
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
     remove_intro_rule_box(content)
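The reworked guard routes a file to the fallback in two distinct cases: the MediaWiki content div is missing, or the page body matches an error pattern. A compact sketch of that decision (needs_fallback is a stand-in name; the single pattern stands in for ERROR_PAGE_PATTERNS):

    # Sketch only: the two conditions that now trigger the fallback path.
    from bs4 import BeautifulSoup

    def needs_fallback(html: str) -> bool:
        soup = BeautifulSoup(html, "html.parser")
        content = soup.select_one("#mw-content-text")
        is_error = "404 not found" in soup.get_text(" ", strip=True).lower()
        return not content or is_error

    print(needs_fallback("<div id='mw-content-text'>A real article</div>"))  # False
    print(needs_fallback("<div id='mw-content-text'>404 Not Found</div>"))   # True
    print(needs_fallback("<div>No content div at all</div>"))                # True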
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
     print("✅ Cleaning complete")
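The progress gate prints every 200 files and always once at the end. A quick check of the arithmetic with a hypothetical 450-file run:

    # Sketch only: expected progress output for total = 450 (invented figure).
    total = 450
    for i in range(1, total + 1):
        if i % 200 == 0 or i == total:
            print(f"{i}/{total} processed ({i/total:.1%})")
    # 200/450 processed (44.4%)
    # 400/450 processed (88.9%)
    # 450/450 processed (100.0%)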
@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 
 # --------------------------------------------------
 # PATHS
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
-    if not title:
-        return None
-
-    title = unquote(title)
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
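Two things are worth noting in the rewritten normalizer: it now assumes a non-None string (callers must guard), and the unquote() percent-decoding step was dropped in favor of NFKC folding plus curly-quote replacement. A standalone sketch, body copied from the hunk above and run on an invented sample title:

    # Sketch only: normalize_title as added here; note NFKC folds the
    # fullwidth low line U+FF3F into "_" before underscores become spaces.
    import re
    import unicodedata

    def normalize_title(title: str) -> str:
        title = title.strip()
        title = unicodedata.normalize("NFKC", title)
        title = title.replace("_", " ")
        title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
        title = re.sub(r"\s+", " ", title)
        return title.casefold()

    print(normalize_title("  L’Histoire_du＿Web  "))  # "l'histoire du web"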