tweak paths and find links
This commit is contained in:
parent
36c8bb2354
commit
e4aaa33137
7 changed files with 309 additions and 10 deletions
scan_internal_links.py (new file, 122 lines)
@@ -0,0 +1,122 @@
"""Scan the downloaded wiki pages for internal links and resolve each one
against the title/alias registries in ../link_registry."""

import json
from pathlib import Path

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

# As used below: title_registry is keyed by normalized page name;
# alias_registry maps an alias to its canonical page name.
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================

def normalize_href(href: str):
    """Reduce an href to a lowercase page name, or None if it should be skipped."""
    if not href:
        return None

    # ignore external links
    if href.startswith("http"):
        return None

    name = Path(href).stem
    return name.lower()


def resolve(name):
    """Map a normalized link name to a canonical page name, or None if unknown."""
    if name in title_registry:
        return name

    if name in alias_registry:
        return alias_registry[name]

    # try removing a category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt

    return None


def extract_article_links(soup):
    """Collect hrefs from the article body, skipping anchors and non-article links."""
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.find_all("a", href=True):
        href = a["href"]

        # ignore in-page anchors
        if href.startswith("#"):
            continue

        # ignore files/images/history/etc.
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue

        links.append(href)

    return links


# ======================
# MAIN
# ======================

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:
        key = normalize_href(href)
        if not key:
            continue

        resolved = resolve(key)

        entry = {
            "source": file_path.name,
            "link": href,
        }

        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================

with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))
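For reference, a minimal sketch of the output shape, inferred from the entry dicts the script builds (the values below are made-up placeholders, not real data):

# One item in resolved_links.json (placeholder values):
# {
#   "source": "some_page.html",    # HTML file the link was found in
#   "link": "another_page.html",   # raw href as it appeared in the page
#   "target": "another_page"       # canonical name from the registries
# }
# Items in unresolved_links.json have the same shape, minus "target".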