# whu_migration_scripts/scan_internal_links.py
#
# Scans the cleaned wiki pages for internal links and resolves each one
# against the equivalence registry, writing resolved/unresolved reports.
import json
import re
import unicodedata
from pathlib import Path
from urllib.parse import parse_qs, unquote, urlparse

from bs4 import BeautifulSoup

# --------------------------------------------------
# CONFIG
# --------------------------------------------------

PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output_ok/link_scan")

OUTPUT_DIR.mkdir(exist_ok=True)

# Normalized-title prefixes for MediaWiki namespaces we do not migrate.
# NOTE(review): these end with a space, but normalize_title() does not map
# ":" to " " — confirm upstream normalization handles "File:X" style titles.
IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
# Fix: the original passed open(...) straight to json.load, leaking the
# read handle; the context manager guarantees it is closed.
with REGISTRY_PATH.open(encoding="utf-8") as registry_file:
    registry = json.load(registry_file)

equivalences = registry["equivalences"]        # normalized title -> canonical filename
canonical_pages = registry["canonical_pages"]  # page -> canonical .html filename
valid_targets = set(canonical_pages.values())  # all canonical .html filenames
# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str) -> str:
    """Normalize a page title or filename for registry lookups.

    Decodes percent-escapes, strips a trailing extension, applies NFKC,
    turns underscores into spaces, maps typographic quotes to their ASCII
    equivalents, collapses whitespace, and casefolds.

    Falsy input yields "" (the original implicitly returned None; both are
    falsy for callers, so this is backward compatible).
    """
    if not title:
        return ""
    title = unquote(title.strip())
    # NOTE(review): Path(...).stem also truncates titles that contain a
    # literal dot (e.g. "Mr. Smith" -> "Mr") — confirm that is intended.
    title = Path(title).stem
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # Fix: the pasted source had replace("", ...) calls — an empty pattern
    # inserts the replacement between every character.  The curly-quote
    # literals were evidently lost in extraction; restore the intended
    # typographic-quote -> ASCII mapping.
    title = (
        title.replace("\u2019", "'")  # ’ right single quote
        .replace("\u2018", "'")       # ‘ left single quote
        .replace("\u201c", '"')       # “ left double quote
        .replace("\u201d", '"')       # ” right double quote
    )
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
def extract_mediawiki_target(href: str):
    """Extract the raw MediaWiki page target from an internal href.

    Returns the page name for "/wiki/<name>" links, the ``title`` query
    parameter for "index.php?title=..." links, or the path stem for plain
    relative links.  Returns None for empty hrefs, same-page anchors, and
    external links.
    """
    if not href:
        return None
    if href.startswith("#"):
        return None  # same-page anchor
    parsed = urlparse(href)
    # Fix: reject every external/absolute form, not only http(s).  This
    # also filters mailto:, javascript:, ftp:, and protocol-relative
    # "//host/..." hrefs (empty scheme but non-empty netloc).
    if parsed.scheme or parsed.netloc:
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem
def is_ignored_namespace(title_norm: str):
    """True when the normalized title lives in a namespace we skip."""
    return any(title_norm.startswith(prefix) for prefix in IGNORED_PREFIXES)
def extract_article_links(soup):
    """Collect the article-body links of a parsed page.

    Only anchors inside the "mw-content-text" div count, and anchors that
    sit inside a navbox (boilerplate navigation) are skipped.  Each link is
    reported as a dict with "href", "title", and "text" keys.
    """
    content = soup.find("div", id="mw-content-text")
    # bs4 Tag truthiness: an absent (None) or empty content div both mean
    # there is nothing to scan.
    if not content:
        return []
    collected = []
    for anchor in content.select("a[href]"):
        if anchor.find_parent(class_="navbox"):
            continue  # navigation boilerplate, not article content
        collected.append(
            {
                "href": anchor.get("href"),
                "title": anchor.get("title"),
                "text": anchor.get_text(strip=True),
            }
        )
    return collected
def resolve_link(raw_target, title_attr):
    """Resolve a scanned link to a canonical page filename.

    The <a> title attribute is tried first, then the href-derived target.
    Returns (filename, method) where method is "equivalence" or "direct",
    (None, "ignored") for skipped namespaces, and (None, "unresolved")
    when no candidate matches.
    """
    for candidate in (title_attr, raw_target):
        if not candidate:
            continue
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        direct = norm.replace(" ", "_") + ".html"
        if direct in valid_targets:
            return direct, "direct"
    return None, "unresolved"
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []
files = list(PAGES_DIR.glob("*.html"))

print(f"{len(files)} pages à analyser")
for i, file_path in enumerate(files, 1):
    soup = BeautifulSoup(
        file_path.read_text(encoding="utf-8", errors="ignore"),
        "html.parser",
    )
    for link in extract_article_links(soup):
        raw_target = extract_mediawiki_target(link["href"])
        resolved, method = resolve_link(raw_target, link["title"])
        entry = {
            "source": file_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }
        if resolved:
            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)
    # Progress heartbeat every 200 pages.
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysées")
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
# Fix: the original passed bare open(...) handles to json.dump and never
# closed them; the context managers guarantee both files are flushed and
# closed even if serialization fails.
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))