# whu_migration_scripts/build_link_registry.py

from pathlib import Path
import json
import re
from bs4 import BeautifulSoup

# ======================
# CONFIG
# ======================
INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../link_registry")
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]
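# e.g. slugify("Category:Help Pages") -> "category:help_pages" and
# slugify("Category_Help_Pages") -> "category_help_pages", so both prefix
# spellings above can appear in registry keys.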
# ======================
# HELPERS
# ======================
def normalize(title: str) -> str:
    """Canonical key normalization."""
    if not title:
        return ""
    title = title.strip()
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)
    return title.lower()
def slugify(title: str) -> str:
    return normalize(title).replace(" ", "_")
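# Examples (derived from the rules above):
#   normalize("Main_Page  ") -> "main page"
#   slugify("Main Page")     -> "main_page"
#   slugify("Category:FAQ")  -> "category:faq"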
def strip_prefix(title: str) -> str:
    t = title.lower()
    for p in PREFIXES:
        if t.startswith(p):
            return title[len(p):]
    return title
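# strip_prefix("Category:FAQ") -> "FAQ" (prefix matched case-insensitively,
# original casing of the remainder preserved).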
def extract_title(soup: BeautifulSoup):
    """Try multiple strategies to extract page title."""
    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)
    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        if "-" in title:
            # rsplit drops only the trailing "- SiteName" suffix, so hyphens
            # inside the page title itself survive
            return title.rsplit("-", 1)[0].strip()
        return title.strip()
    return None
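# e.g. a saved page with <title>FAQ - WHU Wiki</title> yields "FAQ" via
# strategy 2 when no firstHeading is present ("WHU Wiki" is an assumed
# site-name suffix, for illustration).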
def detect_redirect(soup: BeautifulSoup):
    """Detect MediaWiki redirect pages."""
    text = soup.get_text(" ", strip=True).lower()
    # HTTrack redirects often contain this
    if "#redirect" in text:
        link = soup.find("a")
        if link and link.get("href"):
            return link["href"]
    # alternative pattern
    redirect_note = soup.find(class_="redirectText")
    if redirect_note:
        link = redirect_note.find("a")
        if link:
            return link.get("href")
    return None
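# Markup this targets, roughly (a sketch; exact HTTrack/MediaWiki output
# may differ):
#   <ul class="redirectText">
#     <li><a href="Target_Page.html">Target Page</a></li>
#   </ul>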
# ======================
# MAIN
# ======================
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} files found")
for i, file_path in enumerate(files, 1):
    try:
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(html, "html.parser")
        title = extract_title(soup)
        if not title:
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found"
            })
            continue
        key = slugify(title)
        # register canonical page
        title_registry[key] = {
            "title": title,
            "file": str(file_path)
        }
        # detect redirect
        redirect_href = detect_redirect(soup)
        if redirect_href:
            target = Path(redirect_href).stem
            target_key = slugify(target)
            redirects[key] = target_key
            alias_registry[key] = target_key
    except Exception as e:
        unresolved.append({
            "file": file_path.name,
            "reason": str(e)
        })
    if i % 100 == 0:
        print(f"{i}/{len(files)} processed")
# ======================
# AUTO ALIAS GENERATION
# ======================
auto_alias_count = 0
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1
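# Example: if both "category_tutorials" and "tutorials" are registered pages
# ("tutorials" is a hypothetical page name, for illustration), the prefixed
# slug becomes an alias of the bare one.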
print(f"Alias automatiques ajoutés: {auto_alias_count}")
# ======================
# SAVE FILES
# ======================
with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
    json.dump(title_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
    json.dump(alias_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
    json.dump(redirects, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
    json.dump(unresolved, f, indent=2, ensure_ascii=False)
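# Output shapes (as built above):
#   title_registry.json     : slug -> {"title": ..., "file": ...}
#   alias_registry.json     : alias slug -> canonical slug
#   redirects_detected.json : redirect-page slug -> target slug
#   unresolved_pages.json   : [{"file": ..., "reason": ...}, ...]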
print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Pages uniques: {len(title_registry)}")
print(f"Alias: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problèmes: {len(unresolved)}")