whu_migration_scripts/scan_internal_links.py
import json
import re
import unicodedata
from pathlib import Path
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup
# --------------------------------------------------
# CONFIG
# --------------------------------------------------
PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output_ok/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True)
IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
)
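# Prefixes are tested with str.startswith against normalized titles
# (casefolded, underscores turned into spaces); the trailing space keeps
# e.g. "filesystem" from matching the "file " namespace prefix.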
# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
with open(REGISTRY_PATH, encoding="utf-8") as f:
    registry = json.load(f)
equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]
valid_targets = set(canonical_pages.values())
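# Registry layout as used below (inferred from how the fields are consulted
# here): "equivalences" maps a normalized title to a canonical HTML filename;
# "canonical_pages" values are the HTML filenames produced by the migration.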
# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str) -> str | None:
    """Normalize a link target or title attribute into a canonical lookup key."""
    if not title:
        return None
    title = title.strip()
    title = unquote(title)
    title = Path(title).stem  # drop a file extension such as ".html", if any
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
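# Example (illustrative input):
#   normalize_title("R%C3%A9seau_local.html") -> "réseau local"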
def extract_mediawiki_target(href: str) -> str | None:
    """Extract the MediaWiki page title targeted by an internal href."""
    if not href:
        return None
    if href.startswith("#"):
        return None  # same-page anchor
    parsed = urlparse(href)
    if parsed.scheme in ("http", "https"):
        return None  # external link
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem
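# Examples (illustrative inputs):
#   "/wiki/Page_Title"               -> "Page_Title"
#   "/index.php?title=Page_Title"    -> "Page_Title"
#   "https://example.com/elsewhere"  -> None (external link)
#   "Page_Title.html"                -> "Page_Title" (path-stem fallback)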
def is_ignored_namespace(title_norm: str) -> bool:
    return title_norm.startswith(IGNORED_PREFIXES)
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.select("a[href]"):
        if a.find_parent(class_="navbox"):
            continue  # skip navigation-box links
        links.append({
            "href": a.get("href"),
            "title": a.get("title"),
            "text": a.get_text(strip=True),
        })
    return links
def resolve_link(raw_target, title_attr):
    """Resolve a link to a canonical HTML filename, or (None, reason)."""
    candidates = []
    if title_attr:
        candidates.append(title_attr)
    if raw_target:
        candidates.append(raw_target)
    for candidate in candidates:
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        filename = norm.replace(" ", "_") + ".html"
        if filename in valid_targets:
            return filename, "direct"
    return None, "unresolved"
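# Resolution order: the <a> title attribute is tried before the target parsed
# from the href; for each candidate, the equivalence registry takes precedence
# over a direct match against the canonical filenames.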
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []
files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")
for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for link in links:
        raw_target = extract_mediawiki_target(link["href"])
        resolved, method = resolve_link(raw_target, link["title"])
        entry = {
            "source": file_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }
        if resolved:
            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)
    if i % 200 == 0:
        print(f"{i}/{len(files)} scanned")
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)
print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))