mapping before dedup
This commit is contained in:
parent
e4aaa33137
commit
c7b45432b1
1 changed files with 252 additions and 0 deletions
252
prepare_pages_and_registry.py
Normal file
252
prepare_pages_and_registry.py
Normal file
|
|
@ -0,0 +1,252 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import html
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Input directory containing the raw scraped wiki pages (*.html).
SOURCE_DIR = Path("../original_index")
# Root directory for everything this script produces.
OUTPUT_DIR = Path("../output")

# Deduplicated canonical pages are copied here.
PAGES_DIR = Path(OUTPUT_DIR / "pages")
# JSON registry: canonical pages, equivalences, redirects, ignored files.
REGISTRY_PATH = Path(OUTPUT_DIR / "equivalence_registry.json")
# Human-readable run summary (counts plus the first recorded problems).
REPORT_PATH = Path(OUTPUT_DIR / "migration_report.txt")

# Create output directories up front so later copies/writes cannot fail
# on a missing parent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PAGES_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# --------------------------------------------------
# Helpers
# --------------------------------------------------

# Characters not allowed in Windows filenames (regex character class).
INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
# MediaWiki embeds page metadata as JS config variables in the saved
# HTML; these patterns extract the values directly.
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')  # numeric page id
IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')  # page is a redirect
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')  # e.g. "Category"
|
||||||
|
|
||||||
|
def normalize_title(title: str) -> str:
    """Return a canonical comparison key for a wiki title.

    Surrounding whitespace is stripped, underscores become spaces, runs
    of whitespace collapse to a single space, and the result is
    case-folded so lookups are case-insensitive.
    """
    cleaned = re.sub(r"\s+", " ", title.strip().replace("_", " "))
    return cleaned.casefold()
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_filename(name: str) -> str:
    """Make *name* safe to use as a Windows filename.

    Each character Windows forbids in filenames is replaced with an
    underscore and the result is capped at 180 characters.
    """
    safe = re.sub(r'[<>:"/\\|?*]', "_", name)
    return safe[:180]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_wg_page_name(page_html: str) -> str | None:
|
||||||
|
m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
|
||||||
|
if m:
|
||||||
|
return html.unescape(m.group(1)).replace("_", " ")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page_identity(page_html: str):
    """Best-effort extraction of a page's title.

    Tries the wgPageName JS config variable first, then falls back to
    the document <title> tag (stripping the " - <site>" suffix).
    Returns None when neither source yields a title.

    Bug fix: the parameter was previously named ``html``, shadowing the
    stdlib ``html`` module, so the fallback's ``html.unescape(...)``
    call raised AttributeError on a str. Renamed to ``page_html``
    (all call sites are positional).
    """
    page = extract_wg_page_name(page_html)
    if page:
        return page

    # fallback: the <title> tag, rendered by MediaWiki as
    # "Page title - Site name"
    m = re.search(r"<title>(.*?) -", page_html, re.I)
    if m:
        return html.unescape(m.group(1))

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_article_id(page_html: str) -> int | None:
    """Return the positive wgArticleId of a page, or None.

    Special/missing pages carry wgArticleId 0, which is treated the
    same as "no id".

    (Parameter renamed from ``html`` to avoid shadowing the stdlib
    ``html`` module, matching the sibling extractors; call sites are
    positional.)
    """
    m = ARTICLE_ID_RE.search(page_html)
    if m:
        aid = int(m.group(1))
        if aid > 0:
            return aid
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_redirect(html: str) -> str | None:
|
||||||
|
m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I)
|
||||||
|
if m:
|
||||||
|
return m.group(1).strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def namespace_of(title: str):
    """Return the namespace prefix of *title* ("Category:Foo" -> "Category").

    Titles without a colon have no namespace prefix; None is returned
    for them (matching the original's implicit-None behavior).
    """
    prefix, sep, _rest = title.partition(":")
    return prefix if sep else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_internal_redirect(page_html: str):
    """Return the title the server redirected from (wgRedirectedFrom).

    The value is HTML-unescaped and underscores are mapped to spaces so
    it matches titles produced elsewhere in this script. Returns None
    when the variable is absent.
    """
    match = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
    if match is None:
        return None
    return html.unescape(match.group(1)).replace("_", " ")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_namespace(page_html: str) -> str:
    """Return the page's canonical namespace ("" for the main namespace).

    Reads the wgCanonicalNamespace JS config variable; an empty string
    is returned both for main-namespace pages and when the variable is
    missing.

    (Parameter renamed from ``html`` to avoid shadowing the stdlib
    ``html`` module; call sites are positional.)
    """
    m = NAMESPACE_RE.search(page_html)
    if m:
        return m.group(1)
    return ""
|
||||||
|
|
||||||
|
# --------------------------------------------------
# Registry structures
# --------------------------------------------------

# article_id -> {"path": Path, "title": normalized title, "redirect": bool}
# (values are replaced by the copied filename in PASS 3).
canonical_pages = {}
# normalized title -> normalized canonical title it resolves to.
equivalences = {}
# normalized redirect source -> normalized target title.
redirects = {}
# filenames skipped (no article id, or ignored namespace).
ignored_pages = []
# human-readable error strings collected during processing.
problems = []

files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 1 — analyse
# --------------------------------------------------
# Scan every saved page, decide whether it is real content, an ignored
# namespace page, or a redirect, and record one canonical entry per
# article id.

for i, path in enumerate(files, 1):

    try:
        page_html = path.read_text(encoding="utf-8", errors="ignore")

        # Pages without a positive wgArticleId (special/missing pages)
        # are not content — skip them.
        article_id = extract_article_id(page_html)
        if not article_id:
            ignored_pages.append(path.name)
            continue

        title = extract_page_identity(page_html)
        if not title:
            problems.append(f"No title: {path}")
            continue

        ns = extract_namespace(page_html)

        # -------------------------
        # Ignore namespaces
        # -------------------------
        # Maintenance/discussion namespaces are dropped entirely.
        if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
            ignored_pages.append(path.name)
            continue

        title = html.unescape(title)
        norm = normalize_title(title)

        # -------------------------
        # Category pages
        # -------------------------
        # Category pages CAN be canonical content: register a self
        # equivalence, then deliberately fall through (no `continue`)
        # so they also run the redirect/canonical logic below.
        if ns == "Category":
            norm = normalize_title(title)
            equivalences[norm] = norm

        # -------------------------
        # Redirect detection
        # -------------------------
        # wgRedirectedFrom means the server followed a redirect to get
        # here: map the source title onto this page's title.
        redir = extract_internal_redirect(page_html)
        if redir:
            redirects[normalize_title(redir)] = norm
        # -------------------------
        # Canonical article
        # -------------------------

        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        # First file seen for an article id wins, unless the stored
        # entry was a redirect page and this file is real content.
        if article_id not in canonical_pages:
            canonical_pages[article_id] = {
                "path": path,
                "title": norm,
                "redirect": is_redirect,
            }
        elif canonical_pages[article_id]["redirect"] and not is_redirect:
            canonical_pages[article_id] = {
                "path": path,
                "title": norm,
                "redirect": is_redirect,
            }
        # self equivalence
        equivalences[norm] = norm

    except Exception as e:
        # Best-effort pass: record the failure and keep going.
        problems.append(f"{path}: {e}")

    # Progress indicator every 200 files.
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysés")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 2 — resolve redirects
# --------------------------------------------------

def resolve_redirect(key, mapping=None):
    """Follow a redirect chain to its final target title.

    Walks *mapping* (the module-level ``redirects`` dict when None,
    preserving the original behavior) until *key* is no longer a
    redirect source. Cycle-safe: each key is visited at most once, so
    A -> B -> A terminates and returns the last unvisited key.

    ``mapping`` was added as an optional, backward-compatible parameter
    so the resolver can be exercised in isolation.
    """
    if mapping is None:
        mapping = redirects
    seen = set()
    while key in mapping and key not in seen:
        seen.add(key)
        key = mapping[key]
    return key
|
||||||
|
|
||||||
|
|
||||||
|
# Flatten every redirect source to its final canonical target.
for k, v in list(redirects.items()):
    equivalences[k] = resolve_redirect(v)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 3 — copy canonical pages
# --------------------------------------------------
# Copy each surviving page into PAGES_DIR and replace the registry
# value with the destination filename (the Path/dict metadata is no
# longer needed after this point, and filenames are JSON-serializable).

copied = 0

for key, data in canonical_pages.items():

    src = data["path"]
    # NOTE(review): sanitize_filename truncates to 180 chars, so two
    # very long names could collide and silently overwrite — confirm
    # source names are short/unique enough.
    dst_name = sanitize_filename(src.name)
    dst = PAGES_DIR / dst_name

    try:
        shutil.copy2(src, dst)
        # Overwrite the metadata dict with just the copied filename;
        # mutating values (not keys) during iteration is safe.
        canonical_pages[key] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")

print(f"{copied} pages copiées")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------

registry = {
    "canonical_pages": canonical_pages,
    "equivalences": equivalences,
    "redirects": redirects,
    "ignored_pages": ignored_pages,
}

# Redundant with the mkdir calls at the top, but cheap and harmless.
REGISTRY_PATH.parent.mkdir(exist_ok=True)

# ensure_ascii=False keeps accented titles human-readable in the JSON.
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# REPORT
# --------------------------------------------------

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
    f.write(f"Canonical pages: {len(canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Redirects: {len(redirects)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")

    # Cap the detail section so a catastrophic run stays readable.
    for p in problems[:200]:
        f.write(p + "\n")

print("\n✅ PREPARATION COMPLETE")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue