fix mapping canonical preference
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions
@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote

-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------

-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")

-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
+OUTPUT_DIR.mkdir(exist_ok=True)

-# ======================
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
 # HELPERS
-# ======================
+# --------------------------------------------------

+def normalize_title(title: str | None):
+    if not title:
+        return None
+
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
+
+def extract_mediawiki_target(href: str):
+
-def normalize_href(href: str):
     if not href:
         return None

-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None

-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback filename-like
+    return Path(path).stem


-def resolve(name):
-    if name in title_registry:
-        return name
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------

-    if name in alias_registry:
-        return alias_registry[name]
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)

-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)

-    return None

+# -------------------------
+# Extract article content
+# -------------------------

 def extract_article_links(soup):

@@ -52,33 +104,26 @@ def extract_article_links(soup):

     links = []

-    for a in content.find_all("a", href=True):
+    for a in content.select("a[href]"):

-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue

+        href = a.get("href")
         links.append(href)

     return links


-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------

-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages à analyser")

 for i, file_path in enumerate(files, 1):

@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):

     for href in links:

-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+
+        if not norm:
             continue

-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue

         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }

+        resolved = equivalences.get(norm)
+
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)

     if i % 100 == 0:
         print(f"{i}/{len(files)} analysées")

-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------

-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)

 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))
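As a quick sanity check on the new lookup chain (extract the MediaWiki target from an href, normalize it, then look it up in the equivalence registry), a minimal, self-contained sketch is shown below. The helpers are trimmed copies of the ones added in this commit; the equivalences dict and the sample hrefs are made up for illustration only.

import re
from pathlib import Path
from urllib.parse import urlparse, parse_qs, unquote

def extract_mediawiki_target(href):
    # anchors and external URLs are not internal wiki targets
    if not href or href.startswith("#"):
        return None
    parsed = urlparse(href)
    if parsed.scheme in ("http", "https"):
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem

def normalize_title(title):
    # percent-decode, replace underscores, collapse whitespace, casefold
    if not title:
        return None
    title = unquote(title).replace("_", " ")
    return re.sub(r"\s+", " ", title.strip()).casefold()

# toy registry: normalized title -> canonical page title (made-up values)
equivalences = {"main page": "Main Page", "sandbox": "Project:Sandbox"}

for href in ("/wiki/Main_Page", "/index.php?title=Sandbox", "#section", "https://example.org"):
    norm = normalize_title(extract_mediawiki_target(href))
    print(href, "->", norm, "->", equivalences.get(norm))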