from pathlib import Path
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote

# --------------------------------------------------
# PATHS
# --------------------------------------------------

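# Inputs are assumed to come from earlier pipeline steps: a local dump of wiki
# pages in PAGES_DIR and a title-equivalence registry built from that dump.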
PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------

with REGISTRY_PATH.open(encoding="utf-8") as f:
    registry = json.load(f)

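# Registry layout assumed from how the fields are used below:
#   registry["equivalences"]    : normalized link title -> canonical page title
#   registry["canonical_pages"] : mapping whose values are the canonical page titles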
equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]

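# All canonical page titles; not referenced again in this script.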
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------

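# Normalize a wiki title for lookups: percent-decode, underscores -> spaces,
# collapse whitespace, casefold. Illustrative example (not from the source):
#   normalize_title("Getting%20Started_Guide") -> "getting started guide"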
def normalize_title(title: str | None):
    if not title:
        return None

    title = unquote(title)
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title.strip())
    return title.casefold()


# -------------------------
# Extract MediaWiki target
# -------------------------

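# Pull the raw page title out of an internal MediaWiki href.
# Illustrative examples (not from the source):
#   extract_mediawiki_target("/wiki/Main_Page")             -> "Main_Page"
#   extract_mediawiki_target("/index.php?title=Main_Page")  -> "Main_Page"
#   extract_mediawiki_target("#References")                 -> None  (anchor)
#   extract_mediawiki_target("https://example.org/x")       -> None  (external)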
def extract_mediawiki_target(href: str):
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # external link
    if parsed.scheme in ("http", "https"):
        return None

    path = parsed.path or ""

    # /wiki/Page_Name
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]

    # index.php?title=Page
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback filename-like
    return Path(path).stem


# -------------------------
# Ignore unwanted namespaces
# -------------------------

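# Prefixes are lowercase because titles are casefolded by normalize_title(),
# so "File:", "Template:", etc. are matched case-insensitively.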
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)

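# str.startswith() accepts a tuple, so the single call below checks every prefix.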
def is_ignored_namespace(title_norm: str):
    return title_norm.startswith(IGNORED_PREFIXES)


# -------------------------
# Extract article links
# -------------------------

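# Collect hrefs from the main article body (div#mw-content-text), skipping links
# that sit inside navboxes (navigation/metadata rather than article prose).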
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.select("a[href]"):
        # ignore navboxes / metadata
        if a.find_parent(class_="navbox"):
            continue

        href = a.get("href")
        links.append(href)

    return links


# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

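# Every internal link found in the dump is classified as either resolved
# (its normalized title maps to a canonical page via the registry) or unresolved.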
resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:
        raw_target = extract_mediawiki_target(href)
        norm = normalize_title(raw_target)

        if not norm:
            continue

        if is_ignored_namespace(norm):
            continue

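        # Record where the link was found, its raw href, and the normalized target title.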
        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)

        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} pages scanned")

# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------

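# Two JSON reports: links whose target resolved to a canonical page, and links that did not.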
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))