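"""Scan the exported wiki pages for internal links and resolve them.

Reads every HTML file in ../unique_pages, pulls the <a> links out of the
MediaWiki content area (div#mw-content-text), and looks each one up in the
title and alias registries stored in ../link_registry. Links that map to a
known page go to resolved_links.json, the rest to unresolved_links.json.
The relative paths assume the script is run from a directory that sits next
to unique_pages/ and link_registry/.
"""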
from pathlib import Path
import json
from urllib.parse import urlparse

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

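# Both registries are plain JSON files. As inferred from how resolve() uses
# them below: title_registry only needs to answer membership tests (a dict
# keyed by canonical page name, or a list of names), and alias_registry maps
# an alias to its canonical page name.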
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================

def normalize_href(href: str):
    """Reduce an internal href to a lowercase lookup key (its file stem)."""
    if not href:
        return None

    # ignore external links
    if href.startswith("http"):
        return None

    name = Path(href).stem
    return name.lower()

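# Example (hypothetical href, not taken from the data):
#   normalize_href("articles/Some_Page.html") -> "some_page"
#   normalize_href("https://example.org/x")   -> None  (external link)
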
def resolve(name):
    """Map a normalized link key to a canonical page name, or None."""
    if name in title_registry:
        return name

    if name in alias_registry:
        return alias_registry[name]

    # try removing category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt

    return None

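# Example (hypothetical registry contents): with alias_registry
# {"ww2": "world_war_ii"}, resolve("ww2") -> "world_war_ii", and
# resolve("category_battles") falls back to "battles" if that name is in
# title_registry. A key matching nothing comes back as None.
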
def extract_article_links(soup):
    """Collect hrefs from the MediaWiki content area of a parsed page."""
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.find_all("a", href=True):
        href = a["href"]

        # ignore anchors
        if href.startswith("#"):
            continue

        # ignore files/images/history/etc
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue

        links.append(href)

    return links

# ======================
# MAIN
# ======================

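# Each extracted link becomes an entry of the form
# {"source": <page filename>, "link": <raw href>, "target": <canonical name>};
# entries whose key cannot be resolved are stored without "target".
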
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:
        key = normalize_href(href)
        if not key:
            continue

        resolved = resolve(key)

        entry = {
            "source": file_path.name,
            "link": href,
        }

        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================

with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))