# prepare_pages_and_registry.py
#
# NOTE(review): reconstructed from a git patch ("mapping before dedup",
# maximator, 2026-04-03, commit c7b45432) whose text had been collapsed onto
# a few lines. This is the cleaned-up script the patch adds.
#
# Scans a dump of MediaWiki HTML pages, detects canonical articles,
# redirects and title equivalences, copies the canonical pages into an
# output folder and writes a JSON registry plus a text report.

import html
import json
import re
import shutil
from pathlib import Path

SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output")

PAGES_DIR = OUTPUT_DIR / "pages"
REGISTRY_PATH = OUTPUT_DIR / "equivalence_registry.json"
REPORT_PATH = OUTPUT_DIR / "migration_report.txt"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PAGES_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# Helpers
# --------------------------------------------------

INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')


def normalize_title(title: str) -> str:
    """Normalize a wiki title into an equivalence key.

    Underscores become spaces, runs of whitespace collapse to one space,
    and the result is casefolded so lookups are case-insensitive.
    """
    title = title.strip().replace("_", " ")
    title = re.sub(r"\s+", " ", title)
    return title.casefold()


def sanitize_filename(name: str) -> str:
    """Replace Windows-invalid path characters and cap the length at 180."""
    return re.sub(INVALID_WIN_CHARS, "_", name)[:180]


def extract_wg_page_name(page_html: str) -> str | None:
    """Read the MediaWiki ``wgPageName`` config variable from raw page HTML."""
    m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
    if m:
        return html.unescape(m.group(1)).replace("_", " ")
    return None


def extract_page_identity(page_html: str) -> str | None:
    """Best-effort page title: ``wgPageName`` first, then the <title> tag.

    BUG FIX: the parameter used to be named ``html``, shadowing the stdlib
    ``html`` module, so the fallback's ``html.unescape(...)`` raised
    AttributeError (a str has no ``unescape``) whenever wgPageName was absent.
    """
    page = extract_wg_page_name(page_html)
    if page:
        return page

    # Fallback: the <title> tag, trimming the trailing " - <site name>" part.
    # NOTE(review): this regex was garbled in the patch text; reconstructed
    # as a <title> match — confirm against the real source pages.
    m = re.search(r"<title>(.*?)\s*-", page_html, re.I)
    if m:
        return html.unescape(m.group(1))

    return None


def extract_article_id(page_html: str) -> int | None:
    """Return ``wgArticleId`` when present and positive (0 means no article)."""
    m = ARTICLE_ID_RE.search(page_html)
    if m:
        aid = int(m.group(1))
        if aid > 0:
            return aid
    return None


def extract_redirect(page_html: str) -> str | None:
    """Target of a literal ``#REDIRECT [[...]]`` marker, if any.

    NOTE(review): currently unused by the passes below; kept for parity
    with the original script.
    """
    m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", page_html, re.I)
    if m:
        return m.group(1).strip()
    return None


def namespace_of(title: str) -> str | None:
    """Namespace prefix of a title like ``Category:Foo``, else None.

    NOTE(review): currently unused by the passes below; kept for parity
    with the original script.
    """
    if ":" in title:
        return title.split(":", 1)[0]
    return None


def extract_internal_redirect(page_html: str) -> str | None:
    """Title this page was reached from, per ``wgRedirectedFrom``."""
    m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
    if m:
        return html.unescape(m.group(1)).replace("_", " ")
    return None


def extract_namespace(page_html: str) -> str:
    """Canonical namespace of the page ('' for the main namespace)."""
    m = NAMESPACE_RE.search(page_html)
    return m.group(1) if m else ""


# --------------------------------------------------
# Registry structures
# --------------------------------------------------

# Namespaces whose pages are never migrated.
IGNORED_NAMESPACES = (
    "File", "Template", "User", "Talk", "File talk", "Category talk", "User talk",
)

canonical_pages = {}   # article_id -> {"path", "title", "redirect"} then -> filename
equivalences = {}      # normalized title -> normalized canonical title
redirects = {}         # normalized source title -> normalized target title
ignored_pages = []     # source filenames skipped (no article id / bad namespace)
problems = []          # human-readable error strings for the report

files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")


# --------------------------------------------------
# PASS 1 — analyse
# --------------------------------------------------

for i, path in enumerate(files, 1):

    try:
        page_html = path.read_text(encoding="utf-8", errors="ignore")

        article_id = extract_article_id(page_html)
        if not article_id:
            ignored_pages.append(path.name)
            continue

        title = extract_page_identity(page_html)
        if not title:
            problems.append(f"No title: {path}")
            continue

        ns = extract_namespace(page_html)

        # Skip non-content namespaces entirely.
        if ns in IGNORED_NAMESPACES:
            ignored_pages.append(path.name)
            continue

        title = html.unescape(title)
        norm = normalize_title(title)

        # Category pages CAN be canonical content: they get a self
        # equivalence here and deliberately fall through to the
        # redirect/canonical handling below (no `continue`).
        if ns == "Category":
            equivalences[norm] = norm

        # Redirect detection: map the title we were redirected FROM to
        # this page's normalized title.
        redir = extract_internal_redirect(page_html)
        if redir:
            redirects[normalize_title(redir)] = norm

        # Canonical article: first file seen wins for an article id,
        # except that a non-redirect page replaces a redirect page.
        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        entry = canonical_pages.get(article_id)
        if entry is None or (entry["redirect"] and not is_redirect):
            canonical_pages[article_id] = {
                "path": path,
                "title": norm,
                "redirect": is_redirect,
            }

        # self equivalence
        equivalences[norm] = norm

    except Exception as e:
        problems.append(f"{path}: {e}")

    if i % 200 == 0:
        print(f"{i}/{len(files)} analysés")


# --------------------------------------------------
# PASS 2 — resolve redirects
# --------------------------------------------------

def resolve_redirect(key: str) -> str:
    """Follow a redirect chain to its final target, guarding against cycles."""
    seen = set()
    while key in redirects and key not in seen:
        seen.add(key)
        key = redirects[key]
    return key


for source_title, target_title in list(redirects.items()):
    equivalences[source_title] = resolve_redirect(target_title)


# --------------------------------------------------
# PASS 3 — copy canonical pages
# --------------------------------------------------

copied = 0

for key, data in canonical_pages.items():

    src = data["path"]
    dst_name = sanitize_filename(src.name)

    try:
        shutil.copy2(src, PAGES_DIR / dst_name)
        # The registry stores only the copied filename from here on.
        canonical_pages[key] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")
        # BUG FIX: leave a JSON-serializable marker. The original kept the
        # dict (containing a pathlib.Path), which made json.dump below
        # crash with TypeError whenever any copy had failed.
        canonical_pages[key] = None

print(f"{copied} pages copiées")


# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------

registry = {
    "canonical_pages": canonical_pages,
    "equivalences": equivalences,
    "redirects": redirects,
    "ignored_pages": ignored_pages,
}

REGISTRY_PATH.parent.mkdir(exist_ok=True)

with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=2, ensure_ascii=False)


# --------------------------------------------------
# REPORT
# --------------------------------------------------

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
    f.write(f"Canonical pages: {len(canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Redirects: {len(redirects)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
    f.writelines(p + "\n" for p in problems[:200])

print("\n✅ PREPARATION COMPLETE")