# whu_migration_scripts/build_link_registry.py

from pathlib import Path
import json
import re
from bs4 import BeautifulSoup

# ======================
# CONFIG
# ======================
INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../link_registry")
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]
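# e.g. slugify("Category:Help Pages") -> "category:help_pages" and
# slugify("Category_Help_Pages") -> "category_help_pages", so both prefix
# spellings above can appear in registry keys.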
# ======================
# HELPERS
# ======================
def normalize(title: str) -> str:
    """Canonical key normalization."""
    if not title:
        return ""
    title = title.strip()
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)
    return title.lower()
def slugify(title: str) -> str:
    return normalize(title).replace(" ", "_")
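# Examples (derived from the rules above):
#   normalize("Main_Page  ") -> "main page"
#   slugify("Main Page")     -> "main_page"
#   slugify("Category:FAQ")  -> "category:faq"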
def strip_prefix(title: str) -> str:
    t = title.lower()
    for p in PREFIXES:
        if t.startswith(p):
            return title[len(p):]
    return title
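# strip_prefix("Category:FAQ") -> "FAQ" (prefix matched case-insensitively,
# original casing of the remainder preserved).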
def extract_title(soup: BeautifulSoup):
    """Try multiple strategies to extract page title."""
    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)
    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        if "-" in title:
            # rsplit drops only the trailing "- SiteName" suffix, so hyphens
            # inside the page title itself survive
            return title.rsplit("-", 1)[0].strip()
        return title.strip()
    return None
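# e.g. a saved page with <title>FAQ - WHU Wiki</title> yields "FAQ" via
# strategy 2 when no firstHeading is present ("WHU Wiki" is an assumed
# site-name suffix, for illustration).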
def detect_redirect(soup: BeautifulSoup):
    """Detect MediaWiki redirect pages."""
    text = soup.get_text(" ", strip=True).lower()
    # HTTrack redirects often contain this
    if "#redirect" in text:
        link = soup.find("a")
        if link and link.get("href"):
            return link["href"]
    # alternative pattern
    redirect_note = soup.find(class_="redirectText")
    if redirect_note:
        link = redirect_note.find("a")
        if link:
            return link.get("href")
    return None
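# Markup this targets, roughly (a sketch; exact HTTrack/MediaWiki output
# may differ):
#   <ul class="redirectText">
#     <li><a href="Target_Page.html">Target Page</a></li>
#   </ul>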
# ======================
# MAIN
# ======================
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} files found")
for i, file_path in enumerate(files, 1):
    try:
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(html, "html.parser")
        title = extract_title(soup)
        if not title:
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found"
            })
            continue
        key = slugify(title)
        # register canonical page
        title_registry[key] = {
            "title": title,
            "file": str(file_path)
        }
        # detect redirect
        redirect_href = detect_redirect(soup)
        if redirect_href:
            target = Path(redirect_href).stem
            target_key = slugify(target)
            redirects[key] = target_key
            alias_registry[key] = target_key
    except Exception as e:
        unresolved.append({
            "file": file_path.name,
            "reason": str(e)
        })
    if i % 100 == 0:
        print(f"{i}/{len(files)} processed")
# ======================
# AUTO ALIAS GENERATION
# ======================
auto_alias_count = 0
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1
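# Example: if both "category_tutorials" and "tutorials" are registered pages
# ("tutorials" is a hypothetical page name, for illustration), the prefixed
# slug becomes an alias of the bare one.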
print(f"Alias automatiques ajoutés: {auto_alias_count}")
# ======================
# SAVE FILES
# ======================
with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
    json.dump(title_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
    json.dump(alias_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
    json.dump(redirects, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
    json.dump(unresolved, f, indent=2, ensure_ascii=False)
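# Output shapes (as built above):
#   title_registry.json     : slug -> {"title": ..., "file": ...}
#   alias_registry.json     : alias slug -> canonical slug
#   redirects_detected.json : redirect-page slug -> target slug
#   unresolved_pages.json   : [{"file": ..., "reason": ...}, ...]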
print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Pages uniques: {len(title_registry)}")
print(f"Alias: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problèmes: {len(unresolved)}")