fix mapping canonical preference

Maxime Réaux 2026-04-07 15:06:30 +02:00
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions

View file

@@ -5,6 +5,7 @@ import shutil
 import html
 from pathlib import Path
 from collections import defaultdict
+from difflib import SequenceMatcher

 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -24,6 +25,11 @@ INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
 IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
 NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
+WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
+SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
+
+def similarity(a, b):
+    return SequenceMatcher(None, a, b).ratio()

 def normalize_title(title: str) -> str:
     title = title.strip()
@@ -77,6 +83,25 @@ def extract_namespace(html: str) -> str:
     return ""

+def extract_wg_title(page_html):
+    m = WG_TITLE_RE.search(page_html)
+    if m:
+        return html.unescape(m.group(1))
+    return None
+
+def normalize_reference_key(key: str) -> str:
+    key = normalize_title(key)
+    # strip the category namespace prefix
+    key = re.sub(r"^category[\s:_]+", "", key)
+    # collapse whitespace
+    key = re.sub(r"\s+", " ", key)
+    return key.strip()

 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -121,6 +146,7 @@ for i, path in enumerate(files, 1):
     base_title = norm
     is_redirect = bool(IS_REDIRECT_RE.search(page_html))
     is_category = ns == "Category" or norm.startswith("category:")
+    wg_title = extract_wg_title(page_html)

     # Categories
     if ns == "Category":
@@ -148,11 +174,13 @@ for i, path in enumerate(files, 1):
     if redir:
         redirects[full_title] = normalize_title(redir)

+    canonical_key = normalize_reference_key(full_title)
     all_variants[article_id].append({
         "path": path,
         "title": base_title,
-        "full_title": full_title,
+        "canonical_key": canonical_key,  # the normalized key, not the raw full_title
         "article_id": article_id,
+        "wg_title": normalize_title(wg_title) if wg_title else None,
         "redirect": is_redirect,
         "is_category": is_category,
     })
@@ -162,6 +190,7 @@ for i, path in enumerate(files, 1):
     if i % 200 == 0:
         print(f"{i}/{len(files)} analyzed")

+print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 # PASS 2: choose the canonical versions
 # --------------------------------------------------
@@ -173,15 +202,52 @@ category_replaced = 0
 nb_all_cat = 0

+def slug_to_title(filename: str) -> str:
+    name = Path(filename).stem
+    name = re.sub(r"\d+$", "", name)
+    return normalize_title(name)
+
+def filename_similarity_score(filename, wg_title):
+    if not wg_title:
+        return 0
+    filename = normalize_title(filename)
+    wg_title = normalize_title(wg_title)
+    # strip trailing digit suffixes
+    filename = re.sub(r"\d+$", "", filename)
+    return similarity(filename, wg_title)
 def variant_score(v):
-    """
-    The smaller the score, the better the candidate.
-    """
+    filename = v["path"].stem
+    filename_norm = normalize_title(filename)
+
+    similarity_score = filename_similarity_score(
+        filename_norm,
+        v["wg_title"]
+    )
+
+    is_short_slug = bool(
+        SHORT_SLUG_RE.match(filename_norm.replace(" ", ""))
+    )
+
+    long_title_penalty = (
+        "," in filename or
+        "_" in filename or
+        len(filename) > 40
+    )
+
     return (
-        v["is_category"],   # False (0) better than True (1)
-        v["redirect"],      # False is better
-        "category:" in v["path"].name.lower(),  # filename safety check
-        len(v["path"].name),                    # stability
+        v["is_category"],
+        v["redirect"],
+        not is_short_slug,
+        long_title_penalty,
+        -similarity_score,
+        len(filename),
+        filename.lower(),
     )
@@ -189,6 +255,7 @@ for article_id, variants in all_variants.items():
     # deterministic sort
     variants_sorted = sorted(variants, key=variant_score)
+    print(f"variants_sorted: {variants_sorted}")

     chosen = variants_sorted[0]
@@ -198,16 +265,34 @@ for article_id, variants in all_variants.items():
     if chosen["is_category"]:
         category_replaced += 1

+    canonical_title = normalize_reference_key(chosen["title"])
     canonical_pages[article_id] = {
         "path": chosen["path"],
-        "title": chosen["title"],
+        "title": canonical_title,
         "redirect": chosen["redirect"],
     }

     # equivalences
     for v in variants:
-        equivalences[v["full_title"]] = chosen["title"]
+        equivalences[v["canonical_key"]] = chosen["title"]

+equivalences.clear()  # rebuilt below, keyed on canonical slugs
+
+def add_equivalence(k, v):
+    k = normalize_reference_key(k)
+    v = normalize_reference_key(v)
+    if k != v:
+        equivalences[k] = v
+
+for article_id, variants in all_variants.items():
+    canonical_title = canonical_pages[article_id]["title"]
+    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
+    for v in variants:
+        add_equivalence(v["canonical_key"], canonical_slug)
+        filename_key = normalize_title(Path(v["path"]).stem)
+        add_equivalence(filename_key, canonical_slug)

 print(f"Number of cases where all variants are categories: {nb_all_cat}")
 print(f"{category_replaced} 'category_*' pages replaced by their base version")
@@ -231,7 +316,7 @@ for src, dst in list(redirects.items()):
         equivalences[src] = final
 redirects.clear()

 # --------------------------------------------------
 # PASS 4: final normalization of the equivalences
 # --------------------------------------------------
@@ -243,21 +328,41 @@ valid_titles = {
 for k, v in list(equivalences.items()):
     if v not in valid_titles:
         equivalences[k] = equivalences.get(v, v)

+# category:* or category_* used as keys
+for k, v in list(equivalences.items()):
+    new_k = re.sub(r"^category[\s:_]+", "category ", k)
+    if new_k != k:
+        equivalences[new_k] = v
+        del equivalences[k]

 # invariant registry
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")

+# drop identity mappings
+equivalences = {
+    k: v for k, v in equivalences.items()
+    if k != v
+}

 # --------------------------------------------------
 # PASS 5: copy the canonical pages
 # --------------------------------------------------
+def title_to_filename(title: str) -> str:
+    return sanitize_filename(
+        title.replace(" ", "_").casefold() + ".html"
+    )

 copied = 0
-for key, data in canonical_pages.items():
+total = len(canonical_pages)
+for i, (key, data) in enumerate(canonical_pages.items(), 1):
     src = data["path"]
-    dst_name = sanitize_filename(src.name)
+    dst_name = sanitize_filename(src.name.casefold())
     dst = PAGES_DIR / dst_name
     try:
         shutil.copy2(src, dst)
         canonical_pages[key] = dst_name
@@ -265,6 +370,9 @@ for key, data in canonical_pages.items():
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")

+    if i % 200 == 0 or i == total:
+        print(f"{i}/{total} copied")

 print(f"{copied} pages copied")

 # --------------------------------------------------
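
To make the new preference order concrete, here is a minimal, self-contained sketch of how variant_score is meant to rank duplicate files for one article ID. The variant dicts and file names are hypothetical, and normalize_title is a simplified stand-in for the script's own helper (assumed to lowercase, trim, and turn underscores into spaces):

from pathlib import Path
from difflib import SequenceMatcher
import re

SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")

def normalize_title(t):
    # simplified stand-in for the script's helper
    return re.sub(r"\s+", " ", t.replace("_", " ").strip()).lower()

def variant_score(v):
    filename = v["path"].stem
    filename_norm = normalize_title(filename)
    # similarity between the digit-stripped filename and wgTitle
    sim = 0
    if v["wg_title"]:
        stripped = re.sub(r"\d+$", "", filename_norm)
        sim = SequenceMatcher(None, stripped, v["wg_title"]).ratio()
    is_short_slug = bool(SHORT_SLUG_RE.match(filename_norm.replace(" ", "")))
    long_title_penalty = ("," in filename or "_" in filename
                          or len(filename) > 40)
    return (v["is_category"], v["redirect"], not is_short_slug,
            long_title_penalty, -sim, len(filename), filename.lower())

variants = [  # hypothetical duplicates sharing one wgArticleId
    {"path": Path("category_anvil2.html"), "wg_title": "anvil",
     "is_category": True, "redirect": False},
    {"path": Path("anvil,_the_blacksmith_tool.html"), "wg_title": "anvil",
     "is_category": False, "redirect": False},
    {"path": Path("anvil.html"), "wg_title": "anvil",
     "is_category": False, "redirect": False},
]
print(min(variants, key=variant_score)["path"].name)  # -> anvil.html

Under this ordering, short plain slugs that match wgTitle win; category dumps and redirects are pushed back first, then comma- or underscore-laden and overlong names, with similarity, length, and the lowercased name as final tie-breakers.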

View file

@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote

-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
-
-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
-
-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
-
-# ======================
-# HELPERS
-# ======================
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
+# HELPERS
+# --------------------------------------------------
+def normalize_title(title: str | None):
+    if not title:
+        return None
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
-def normalize_href(href: str):
+def extract_mediawiki_target(href: str):
     if not href:
         return None

-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None

-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback: filename-like
+    return Path(path).stem
-def resolve(name):
-    if name in title_registry:
-        return name
-    if name in alias_registry:
-        return alias_registry[name]
-
-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
-    return None
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
+
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)
+
+# -------------------------
+# Extract article links
+# -------------------------
 def extract_article_links(soup):
@@ -52,33 +104,26 @@ def extract_article_links(soup):
     links = []
-    for a in content.find_all("a", href=True):
-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+    for a in content.select("a[href]"):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue
+        href = a.get("href")
         links.append(href)
     return links
-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------
-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages to scan")

 for i, file_path in enumerate(files, 1):
@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):
     for href in links:
-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+        if not norm:
             continue
-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue

         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }

+        resolved = equivalences.get(norm)
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)

     if i % 100 == 0:
         print(f"{i}/{len(files)} scanned")
-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------
-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)

 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))