whu_migration_scripts/scan_internal_links.py
from pathlib import Path
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote
import unicodedata
# --------------------------------------------------
# PATHS
# --------------------------------------------------
PAGES_DIR = Path("../output/cleaned_pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True)
# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
with REGISTRY_PATH.open(encoding="utf-8") as f:
    registry = json.load(f)
equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]
valid_targets = set(canonical_pages.values())
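# The registry is assumed to look roughly like this (illustrative shape only,
# not taken from the real file): "equivalences" maps a normalized title to its
# canonical title, and "canonical_pages" maps canonical titles to target pages.
#   {
#     "equivalences":    {"autre page": "Autre Page", ...},
#     "canonical_pages": {"Autre Page": "Autre_Page.html", ...}
#   }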
# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str) -> str:
    if not title:
        return ""
    title = title.strip()
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # normalize curly quotes to straight ASCII quotes
    title = title.replace("\u2018", "'").replace("\u2019", "'")
    title = title.replace("\u201c", '"').replace("\u201d", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
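# Illustrative examples (assumed titles, not from the wiki):
#   normalize_title("Étudiants_WHU")      -> "étudiants whu"
#   normalize_title("  Vie   pratique ")  -> "vie pratique"
#   normalize_title("")                   -> ""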
# -------------------------
# Extract MediaWiki target
# -------------------------
def extract_mediawiki_target(href: str):
    if not href:
        return None
    # ignore in-page anchors
    if href.startswith("#"):
        return None
    parsed = urlparse(href)
    # external link
    if parsed.scheme in ("http", "https"):
        return None
    path = parsed.path or ""
    # /wiki/Page_Name
    if "/wiki/" in path:
        return unquote(path.split("/wiki/", 1)[1])
    # index.php?title=Page
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    # fallback: treat the last path component as a page name
    return Path(path).stem
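# Illustrative examples (assumed URL forms):
#   "/wiki/Vie_%C3%A9tudiante"       -> "Vie_étudiante"  (percent-decoded)
#   "/index.php?title=Accueil"       -> "Accueil"
#   "https://example.org/some/page"  -> None  (external link, skipped)
#   "#Section"                       -> None  (in-page anchor, skipped)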
# -------------------------
# Ignore unwanted namespaces
# -------------------------
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)

def is_ignored_namespace(title_norm: str):
    return title_norm.startswith(IGNORED_PREFIXES)
# -------------------------
# Extract article content
# -------------------------
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.select("a[href]"):
        # ignore navboxes / metadata
        if a.find_parent(class_="navbox"):
            continue
        href = a.get("href")
        links.append(href)
    return links
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for href in links:
        raw_target = extract_mediawiki_target(href)
        norm = normalize_title(raw_target)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            continue
        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }
        resolved = equivalences.get(norm)
        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)
    if i % 100 == 0:
        print(f"{i}/{len(files)} pages scanned")
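# Each record written below has the shape (illustrative values):
#   {"source": "Some_Page.html", "href": "/wiki/Autre_Page",
#    "normalized": "autre page", "resolved_title": "Autre Page"}
# "resolved_title" is only present for resolved links.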
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
with (OUTPUT_DIR / "resolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with (OUTPUT_DIR / "unresolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)
print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))