tweak paths and find links
This commit is contained in:
parent
36c8bb2354
commit
e4aaa33137
7 changed files with 309 additions and 10 deletions
177
build_link_registry.py
Normal file
177
build_link_registry.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
from pathlib import Path
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ======================
# CONFIG
# ======================

# Crawled HTML pages to scan (relative to this script's working dir).
INPUT_DIR = Path("../unique_pages")
# Where the generated JSON registries are written.
OUTPUT_DIR = Path("../link_registry")

# Create the output directory up front; reuse it silently if it exists.
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]
|
||||
|
||||
# ======================
|
||||
# HELPERS
|
||||
# ======================
|
||||
|
||||
def normalize(title: str) -> str:
    """Return the canonical lookup-key form of *title*.

    Underscores are treated as spaces (MediaWiki convention), runs of
    whitespace collapse to a single space, and the result is trimmed
    and lower-cased.  Falsy input (None, "") yields "".
    """
    if not title:
        return ""

    # Replace underscores BEFORE trimming: the previous order stripped
    # first, so underscore-padded titles ("_Foo_") kept stray spaces
    # (" foo ") and produced a different key than the plain form.
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)

    return title.strip().lower()
|
||||
|
||||
|
||||
def slugify(title: str) -> str:
    """Return the filename-style key for *title* (canonical form, spaces -> underscores)."""
    canonical = normalize(title)
    return canonical.replace(" ", "_")
|
||||
|
||||
|
||||
def strip_prefix(title: str) -> str:
    """Drop a known namespace prefix (see PREFIXES) from *title*, if present.

    Matching is case-insensitive; the remainder keeps its original
    casing.  Titles without a recognised prefix are returned unchanged.
    """
    lowered = title.lower()
    for prefix in PREFIXES:
        if lowered.startswith(prefix):
            # Slice the ORIGINAL string so casing after the prefix survives.
            return title[len(prefix):]
    return title
|
||||
|
||||
|
||||
def extract_title(soup: "BeautifulSoup"):
    """Best-effort extraction of a page title from parsed HTML.

    Strategy 1: the MediaWiki ``<h1 id="firstHeading">`` element.
    Strategy 2: the ``<title>`` tag, keeping only the part before a
    ``" - "`` separator (wikis commonly append the site name there).

    Returns the title string, or None when neither strategy applies.
    """

    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)

    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        # Split on " - " (with spaces) rather than a bare "-": splitting
        # on "-" truncated hyphenated titles ("Spider-Man" -> "Spider").
        base, sep, _ = title.partition(" - ")
        if sep:
            return base.strip()
        return title.strip()

    return None
|
||||
|
||||
|
||||
def detect_redirect(soup: BeautifulSoup):
    """Return the redirect target href of a MediaWiki redirect page, else None.

    Two heuristics, tried in order:
      1. the page text contains "#redirect" (common in HTTrack dumps) —
         the first anchor's href is taken as the target;
      2. an element with class "redirectText" wraps the target link.
    """
    page_text = soup.get_text(" ", strip=True).lower()

    # HTTrack redirects often contain this
    if "#redirect" in page_text:
        anchor = soup.find("a")
        if anchor and anchor.get("href"):
            return anchor["href"]

    # alternative pattern
    note = soup.find(class_="redirectText")
    if note:
        anchor = note.find("a")
        if anchor:
            return anchor.get("href")

    return None
|
||||
|
||||
|
||||
# ======================
# MAIN
# ======================

# Registries built while scanning: canonical pages, alias -> target keys,
# detected redirect pages, and files that could not be processed.
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")

for i, file_path in enumerate(files, 1):

    try:
        markup = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(markup, "html.parser")

        page_title = extract_title(soup)
        if not page_title:
            # No title means no usable key — record and move on.
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found",
            })
            continue

        key = slugify(page_title)

        # register canonical page
        title_registry[key] = {
            "title": page_title,
            "file": str(file_path),
        }

        # detect redirect
        href = detect_redirect(soup)
        if href:
            # The href's stem is the target page's filename-style key.
            target_key = slugify(Path(href).stem)
            redirects[key] = target_key
            alias_registry[key] = target_key

    except Exception as e:
        # Best-effort batch job: keep going, remember what failed and why.
        unresolved.append({
            "file": file_path.name,
            "reason": str(e),
        })

    if i % 100 == 0:
        print(f"{i}/{len(files)} traités")

# ======================
# AUTO ALIAS GENERATION
# ======================

auto_alias_count = 0

# If both a prefixed page ("category_foo") and its bare form ("foo")
# exist, alias the prefixed key to the bare one.
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1

print(f"Alias automatiques ajoutés: {auto_alias_count}")

# ======================
# SAVE FILES
# ======================

for filename, payload in (
    ("title_registry.json", title_registry),
    ("alias_registry.json", alias_registry),
    ("redirects_detected.json", redirects),
    ("unresolved_pages.json", unresolved),
):
    with open(OUTPUT_DIR / filename, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Pages uniques: {len(title_registry)}")
print(f"Alias: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problèmes: {len(unresolved)}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue