keep error pages with fallback content
parent 4e473ba2c9
commit 8e9289998b
2 changed files with 63 additions and 14 deletions
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # generic
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""<html>
+<head>
+<meta charset="utf-8">
+<title>{safe_title}</title>
+</head>
+<body>
+<h1>{safe_title}</h1>
+<p>Lost content (HTTracker) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
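When a page has no usable <title>, build_fallback_html derives a heading from the filename instead. An illustrative call (the filename is hypothetical), assuming the helper as defined above:

    # Hypothetical example: with an empty title, the filename drives the heading.
    html = build_fallback_html(title="", filename="Main_Page.html")
    # safe_title becomes "Main Page" ("_" -> " ", ".html" stripped)
    print(html.splitlines()[3])  # <title>Main Page</title>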
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
     remove_intro_rule_box(content)
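Because is_error_page(soup) scans the full document text, a mirrored error page is caught even when it still carries a #mw-content-text container. A self-contained sketch of that detection path (pattern list inlined for brevity):

    from bs4 import BeautifulSoup

    ERROR_PAGE_PATTERNS = ["503 service unavailable", "404 not found"]

    def is_error_page(soup: BeautifulSoup) -> bool:
        text = soup.get_text(" ", strip=True).lower()
        return any(p in text for p in ERROR_PAGE_PATTERNS)

    snippet = "<html><body><h1>503 Service Unavailable</h1></body></html>"
    print(is_error_page(BeautifulSoup(snippet, "html.parser")))  # True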
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} files found")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
 
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} processed ({i/total:.1%})")
     print("✅ Cleaning complete")
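The modulo guard keeps output sparse on large mirrors: one progress line per 200 files plus a final summary line. A quick illustration of the cadence, assuming a hypothetical total of 450 files:

    # Hypothetical run with 450 files: prints at 200, 400, and the end.
    total = 450
    for i in range(1, total + 1):
        if i % 200 == 0 or i == total:
            print(f"{i}/{total} processed ({i/total:.1%})")
    # 200/450 processed (44.4%)
    # 400/450 processed (88.9%)
    # 450/450 processed (100.0%)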