# whu_migration_scripts/scan_internal_links.py

from pathlib import Path
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote
import unicodedata

# --------------------------------------------------
# PATHS
# --------------------------------------------------

# All paths are relative to the current working directory, so the script
# is expected to be launched from the scripts folder.
PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")

# parents=True so a fresh checkout without "../output" does not fail here.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------

# Use a context manager so the registry file handle is closed right away
# instead of being left to the garbage collector.
with open(REGISTRY_PATH, encoding="utf-8") as registry_file:
    registry = json.load(registry_file)

# Looked up with normalized titles as keys during the scan.
equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]

# Set of every canonical page value found in the registry.
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------

def normalize_title(title: str) -> str:
    """Return a canonical, case-insensitive lookup key for a page title.

    The title is NFKC-normalized, underscores become spaces, typographic
    quotes are replaced by their ASCII counterparts, runs of whitespace are
    collapsed to a single space, surrounding whitespace is removed and the
    result is case-folded.

    Args:
        title: Raw page title. ``None`` or an empty string is tolerated so
            that callers can pass the result of a failed extraction.

    Returns:
        The normalized key, or ``""`` when there is no title.
    """
    if not title:
        return ""

    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    # Strip only after underscores became spaces, otherwise "_Foo_" would
    # keep a leading and a trailing blank.
    title = re.sub(r"\s+", " ", title).strip()
    return title.casefold()

# -------------------------
# Extract MediaWiki target
# -------------------------

def extract_mediawiki_target(href: str):
    """Extract the raw page title an internal link points to.

    Supported link shapes, tried in this order:

    * ``.../wiki/Page_Name``        -> ``Page_Name``
    * ``...index.php?title=Page``   -> ``Page``
    * anything else                 -> stem of the last path component

    Percent-escapes are decoded in every case so that ``Caf%C3%A9`` yields
    ``Café``.

    Args:
        href: Value of an ``href`` attribute; may be empty or ``None``.

    Returns:
        The decoded title (possibly an empty string), or ``None`` for empty
        hrefs, pure ``#anchor`` links and links to another host.
    """
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # external link: explicit http(s) scheme, or protocol-relative "//host/..."
    if parsed.scheme in ("http", "https") or parsed.netloc:
        return None

    path = parsed.path or ""

    # /wiki/Page_Name
    if "/wiki/" in path:
        # urlparse leaves the path percent-encoded, so decode it here.
        return unquote(path.split("/wiki/", 1)[1])

    # index.php?title=Page
    if "index.php" in path:
        # parse_qs already percent-decodes the values.
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback filename-like; decode after taking the stem so an encoded
    # "/" (%2F) inside the name is not mistaken for a path separator
    return unquote(Path(path).stem)


# -------------------------
# Ignore unwanted namespaces
# -------------------------

# Namespace prefixes (already case-folded, colon included) whose links are
# skipped by the scan.
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)


def is_ignored_namespace(title_norm: str):
    """Tell whether a normalized title starts with an ignored namespace.

    Args:
        title_norm: Title compared as-is against the lower-case prefixes in
            ``IGNORED_PREFIXES``.

    Returns:
        ``True`` if the title begins with one of ``IGNORED_PREFIXES``,
        ``False`` otherwise.
    """
    return any(title_norm.startswith(prefix) for prefix in IGNORED_PREFIXES)


# -------------------------
# Extract article content
# -------------------------

def extract_article_links(soup):
    """Collect the href of every link inside the article body.

    Only anchors located under ``<div id="mw-content-text">`` are
    considered, and anchors that have an ancestor with the ``navbox`` class
    are left out.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``select``
            API.

    Returns:
        List of href attribute values in document order; empty when the
        content div is absent.
    """
    body = soup.find("div", id="mw-content-text")
    if not body:
        return []

    return [
        anchor.get("href")
        for anchor in body.select("a[href]")
        # ignore navboxes / metadata
        if not anchor.find_parent(class_="navbox")
    ]


# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

# Links whose normalized target was found in the registry equivalences.
resolved_links = []
# Links whose normalized target has no entry in the registry equivalences.
unresolved_links = []

# Sorted so the generated reports come out in the same order on every run
# (glob order depends on the file system).
files = sorted(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser")

for i, file_path in enumerate(files, 1):

    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    for href in extract_article_links(soup):

        raw_target = extract_mediawiki_target(href)

        # Anchors, external links and empty hrefs yield no target; skip
        # them before normalizing, since normalizing None would raise.
        if not raw_target:
            continue

        norm = normalize_title(raw_target)

        if not norm:
            continue

        if is_ignored_namespace(norm):
            continue

        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)

        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    # Progress report every 100 pages.
    if i % 100 == 0:
        print(f"{i}/{len(files)} analysées")

# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------

# Write each report through a context manager so the file is flushed and
# closed deterministically, even if serialization raises.
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as resolved_file:
    json.dump(
        resolved_links,
        resolved_file,
        indent=2,
        ensure_ascii=False,
    )

with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as unresolved_file:
    json.dump(
        unresolved_links,
        unresolved_file,
        indent=2,
        ensure_ascii=False,
    )

# Final summary of the scan.
print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								from pathlib import Path
 								import json
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								import re
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								from bs4 import BeautifulSoup
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								from urllib.parse import urlparse, parse_qs, unquote
-												keep error pages with fallback content

											
										
										
											2026-04-15 10:36:21 +02:00
+								import unicodedata
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# --------------------------------------------------
 								# PATHS
 								# --------------------------------------------------
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								PAGES_DIR = Path("../output/pages")
 								REGISTRY_PATH = Path("../output/equivalence_registry.json")
 								OUTPUT_DIR = Path("../output/link_scan")
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								OUTPUT_DIR.mkdir(exist_ok=True)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# --------------------------------------------------
 								# LOAD REGISTRY
 								# --------------------------------------------------
 								registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
 								equivalences = registry["equivalences"]
 								canonical_pages = registry["canonical_pages"]
 								valid_targets = set(canonical_pages.values())
 								# --------------------------------------------------
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								# HELPERS
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# --------------------------------------------------
-												keep error pages with fallback content

											
										
										
											2026-04-15 10:36:21 +02:00
+								def normalize_title(title: str) -> str:
 								    title = title.strip()
 								    title = unicodedata.normalize("NFKC", title)
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    title = title.replace("_", " ")
-												keep error pages with fallback content

											
										
										
											2026-04-15 10:36:21 +02:00
+								    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
 								    title = re.sub(r"\s+", " ", title)
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    return title.casefold()
 								# -------------------------
 								# Extract MediaWiki target
 								# -------------------------
 								def extract_mediawiki_target(href: str):
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
 								    if not href:
 								        return None
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    # ignore anchors
 								    if href.startswith("#"):
 								        return None
 								    parsed = urlparse(href)
 								    # external link
 								    if parsed.scheme in ("http", "https"):
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								        return None
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    path = parsed.path or ""
 								    # /wiki/Page_Name
 								    if "/wiki/" in path:
 								        return path.split("/wiki/", 1)[1]
 								    # index.php?title=Page
 								    if "index.php" in path:
 								        qs = parse_qs(parsed.query)
 								        if "title" in qs:
 								            return qs["title"][0]
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    # fallback filename-like
 								    return Path(path).stem
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# -------------------------
 								# Ignore unwanted namespaces
 								# -------------------------
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								IGNORED_PREFIXES = (
 								    "file:",
 								    "image:",
 								    "template:",
 								    "special:",
 								    "help:",
 								    "user:",
 								    "talk:",
 								)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								def is_ignored_namespace(title_norm: str):
 								    return title_norm.startswith(IGNORED_PREFIXES)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# -------------------------
 								# Extract article content
 								# -------------------------
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								def extract_article_links(soup):
 								    content = soup.find("div", id="mw-content-text")
 								    if not content:
 								        return []
 								    links = []
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								    for a in content.select("a[href]"):
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								        # ignore navboxes / metadata
 								        if a.find_parent(class_="navbox"):
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								            continue
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								        href = a.get("href")
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								        links.append(href)
 								    return links
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# --------------------------------------------------
 								# MAIN SCAN
 								# --------------------------------------------------
 								resolved_links = []
 								unresolved_links = []
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								files = list(PAGES_DIR.glob("*.html"))
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								print(f"{len(files)} pages à analyser")
 								for i, file_path in enumerate(files, 1):
 								    html = file_path.read_text(encoding="utf-8", errors="ignore")
 								    soup = BeautifulSoup(html, "html.parser")
 								    links = extract_article_links(soup)
 								    for href in links:
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								        raw_target = extract_mediawiki_target(href)
 								        norm = normalize_title(raw_target)
 								        if not norm:
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								            continue
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								        if is_ignored_namespace(norm):
 								            continue
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
 								        entry = {
 								            "source": file_path.name,
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								            "href": href,
 								            "normalized": norm,
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								        }
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								        resolved = equivalences.get(norm)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								        if resolved:
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								            entry["resolved_title"] = resolved
 								            resolved_links.append(entry)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
+								        else:
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								            unresolved_links.append(entry)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
 								    if i % 100 == 0:
 								        print(f"{i}/{len(files)} analysées")
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								# --------------------------------------------------
 								# SAVE RESULTS
 								# --------------------------------------------------
 								json.dump(
 								    resolved_links,
 								    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
 								    indent=2,
 								    ensure_ascii=False,
 								)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								json.dump(
 								    unresolved_links,
 								    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
 								    indent=2,
 								    ensure_ascii=False,
 								)
-												tweak paths and find links

											
										
										
											2026-04-03 15:50:40 +02:00
 								print("\n✅ LINK SCAN COMPLETE")
-												fix maaping canonical preference

											
										
										
											2026-04-07 15:06:30 +02:00
+								print("Resolved:", len(resolved_links))
 								print("Unresolved:", len(unresolved_links))