# Link scanner: walks dumped MediaWiki HTML pages, extracts in-article
# links, normalizes their targets, and splits them into resolved /
# unresolved sets against the equivalence registry.
from pathlib import Path
import json
import re
import unicodedata
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup

# --------------------------------------------------
# PATHS
# --------------------------------------------------

PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")

# parents=True so the scan also works when ../output itself is missing
# (the original mkdir(exist_ok=True) would raise FileNotFoundError then).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------

# Close the registry file deterministically instead of leaving the bare
# open() handle to the garbage collector.
with REGISTRY_PATH.open(encoding="utf-8") as fh:
    registry = json.load(fh)

# Mapping of normalized link target -> resolved title (used by the scan loop).
equivalences = registry["equivalences"]
# Mapping of canonical page keys -> canonical titles.
canonical_pages = registry["canonical_pages"]

# All canonical titles, as a set for O(1) membership checks.
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------
# Single-pass translation table folding curly quotes to their ASCII forms.
_QUOTE_MAP = str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'})


def normalize_title(title: str) -> str:
    """Normalize a wiki title for comparison.

    Steps: trim, NFKC-normalize, underscores -> spaces, curly quotes ->
    ASCII quotes, collapse runs of whitespace, then casefold.
    """
    cleaned = unicodedata.normalize("NFKC", title.strip())
    cleaned = cleaned.replace("_", " ").translate(_QUOTE_MAP)
    return re.sub(r"\s+", " ", cleaned).casefold()
# -------------------------
# Extract MediaWiki target
# -------------------------

def extract_mediawiki_target(href: str):
    """Return the raw wiki page title referenced by *href*, or None.

    Recognizes the MediaWiki link shapes present in the dump:
      * pretty URLs   -> /wiki/Page_Name
      * script URLs   -> .../index.php?title=Page_Name
      * bare/relative -> fallback: the path's filename stem
    Same-page anchors and external links yield None.
    """
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # External link. BUG FIX: the original only rejected http/https, so
    # protocol-relative //host/... links (empty scheme, netloc set) and
    # other schemes (mailto:, ftp:, ...) fell through to the fallback.
    if parsed.scheme or parsed.netloc:
        return None

    path = parsed.path or ""

    # /wiki/Page_Name — decode %XX escapes so titles match the registry
    # (unquote is imported at the top of the file for exactly this).
    if "/wiki/" in path:
        return unquote(path.split("/wiki/", 1)[1])

    # index.php?title=Page (parse_qs already percent-decodes values)
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback: treat the last path component (minus extension) as a title
    return unquote(Path(path).stem)
# -------------------------
# Ignore unwanted namespaces
# -------------------------

# MediaWiki namespace prefixes (already casefolded) whose links are
# metadata rather than article content.
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)


def is_ignored_namespace(title_norm: str):
    """Return True when the normalized title lives in a skipped namespace."""
    namespace, colon, _rest = title_norm.partition(":")
    return bool(colon) and f"{namespace}:" in IGNORED_PREFIXES
# -------------------------
# Extract article content
# -------------------------

def extract_article_links(soup):
    """Collect href values from the article body of a parsed page.

    Only anchors inside the #mw-content-text container are considered,
    and anchors nested inside a navbox (navigation template) are dropped
    as non-article metadata. Returns a list of href strings.
    """
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    return [
        anchor.get("href")
        for anchor in content.select("a[href]")
        # skip navbox / metadata links
        if not anchor.find_parent(class_="navbox")
    ]
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser")

for i, file_path in enumerate(files, 1):

    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    for href in extract_article_links(soup):

        raw_target = extract_mediawiki_target(href)

        # BUG FIX: extract_mediawiki_target returns None for anchors and
        # external links; normalize_title(None) would raise AttributeError
        # on the very first such link.
        if not raw_target:
            continue

        norm = normalize_title(raw_target)

        if not norm:
            continue

        if is_ignored_namespace(norm):
            continue

        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)

        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    # lightweight progress report
    if i % 100 == 0:
        print(f"{i}/{len(files)} analysées")
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------

def _dump_json(path, payload):
    """Write *payload* as UTF-8 JSON to *path*, closing the file handle.

    The original bare open() calls left closing/flushing to the garbage
    collector, risking truncated output on interpreter teardown.
    """
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)


_dump_json(OUTPUT_DIR / "resolved_links.json", resolved_links)
_dump_json(OUTPUT_DIR / "unresolved_links.json", unresolved_links)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))