# whu_migration_scripts/scan_internal_links.py
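"""Scan exported wiki pages for internal links and resolve them against
the link registries.

Reads every *.html page in INPUT_DIR, extracts article links from the
MediaWiki content area, and writes two JSON reports to REGISTRY_DIR:
resolved_links.json and unresolved_links.json.
"""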
import json
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

# title_registry keys are canonical page names; alias_registry maps an
# alias to its canonical name (see resolve() below).
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []
# ======================
# HELPERS
# ======================
def normalize_href(href: str):
    if not href:
        return None
    # ignore external links
    if href.startswith("http"):
        return None
    # drop any query string or fragment, then keep the file stem
    path = urlparse(href).path
    name = Path(path).stem
    return name.lower()
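# Hypothetical example: normalize_href("Some_Page.html#History") -> "some_page".
# External links and empty hrefs normalize to None.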
def resolve(name):
    if name in title_registry:
        return name
    if name in alias_registry:
        return alias_registry[name]
    # try removing the category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt
    return None
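# Resolution order: exact title match, then the alias table, then a retry
# with the "category_" prefix stripped. Hypothetical example:
# resolve("category_weapons") -> "weapons" if "weapons" is a registered title.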
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.find_all("a", href=True):
        href = a["href"]
        # ignore in-page anchors
        if href.startswith("#"):
            continue
        # ignore files/images/special pages/actions
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue
        links.append(href)
    return links
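# "mw-content-text" is the standard MediaWiki article-body container, so
# navigation, sidebar, and footer links never reach the resolver.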
# ======================
# MAIN
# ======================
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for href in links:
        key = normalize_href(href)
        if not key:
            continue
        resolved = resolve(key)
        entry = {
            "source": file_path.name,
            "link": href,
        }
        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)
    if i % 100 == 0:
        print(f"{i}/{len(files)} scanned")
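# Each report entry looks like (values hypothetical):
#   {"source": "Some_Page.html", "link": "Other_Page.html", "target": "other_page"}
# Unresolved entries carry "source" and "link" only.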
# ======================
# SAVE
# ======================
with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)
print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))
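
# Assumed usage (run from whu_migration_scripts/, since paths are relative):
#   python scan_internal_links.py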