import os import re import json import shutil import html from pathlib import Path from collections import defaultdict SOURCE_DIR = Path("../original_index") OUTPUT_DIR = Path("../output") PAGES_DIR = Path(OUTPUT_DIR / "pages") REGISTRY_PATH = Path(OUTPUT_DIR / "equivalence_registry.json") REPORT_PATH = Path(OUTPUT_DIR / "migration_report.txt") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) PAGES_DIR.mkdir(parents=True, exist_ok=True) # -------------------------------------------------- # Helpers # -------------------------------------------------- INVALID_WIN_CHARS = r'[<>:"/\\|?*]' ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)') IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true') NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"') def normalize_title(title: str) -> str: title = title.strip() title = title.replace("_", " ") title = re.sub(r"\s+", " ", title) return title.casefold() def sanitize_filename(name: str) -> str: name = re.sub(INVALID_WIN_CHARS, "_", name) return name[:180] def extract_wg_page_name(page_html: str) -> str | None: m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html) if m: return html.unescape(m.group(1)).replace("_", " ") return None def extract_page_identity(html: str): page = extract_wg_page_name(html) if page: return page # fallback title tag m = re.search(r"(.*?) -", html, re.I) if m: return html.unescape(m.group(1)) return None def extract_article_id(html: str) -> int | None: m = ARTICLE_ID_RE.search(html) if m: aid = int(m.group(1)) if aid > 0: return aid return None def extract_redirect(html: str) -> str | None: m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I) if m: return m.group(1).strip() return None def namespace_of(title: str): if ":" in title: return title.split(":", 1)[0] def extract_internal_redirect(page_html: str): m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html) if m: return html.unescape(m.group(1)).replace("_", " ") return None def extract_namespace(html: str) -> str: m = NAMESPACE_RE.search(html) if m: return m.group(1) return "" # -------------------------------------------------- # Registry structures # -------------------------------------------------- canonical_pages = {} equivalences = {} redirects = {} ignored_pages = [] problems = [] files = list(SOURCE_DIR.glob("*.html")) print(f"{len(files)} fichiers trouvés") # -------------------------------------------------- # PASS 1 — analyse # -------------------------------------------------- for i, path in enumerate(files, 1): try: page_html = path.read_text(encoding="utf-8", errors="ignore") article_id = extract_article_id(page_html) if not article_id: ignored_pages.append(path.name) continue title = extract_page_identity(page_html) if not title: problems.append(f"No title: {path}") continue ns = extract_namespace(page_html) # ------------------------- # Ignore namespaces # ------------------------- if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"): ignored_pages.append(path.name) continue title = html.unescape(title) norm = normalize_title(title) # ------------------------- # Category pages # ------------------------- # Category pages CAN be canonical content if ns == "Category": norm = normalize_title(title) equivalences[norm] = norm # ------------------------- # Redirect detection # ------------------------- redir = extract_internal_redirect(page_html) if redir: redirects[normalize_title(redir)] = norm # ------------------------- # Canonical article # ------------------------- is_redirect = bool(IS_REDIRECT_RE.search(page_html)) if article_id not in canonical_pages: canonical_pages[article_id] = { "path": path, "title": norm, "redirect": is_redirect, } elif canonical_pages[article_id]["redirect"] and not is_redirect: canonical_pages[article_id] = { "path": path, "title": norm, "redirect": is_redirect, } # self equivalence equivalences[norm] = norm except Exception as e: problems.append(f"{path}: {e}") if i % 200 == 0: print(f"{i}/{len(files)} analysés") # -------------------------------------------------- # PASS 2 — resolve redirects # -------------------------------------------------- def resolve_redirect(key): seen = set() while key in redirects and key not in seen: seen.add(key) key = redirects[key] return key for k, v in list(redirects.items()): equivalences[k] = resolve_redirect(v) # -------------------------------------------------- # PASS 3 — copy canonical pages # -------------------------------------------------- copied = 0 # for key, src in canonical_pages.items(): for key, data in canonical_pages.items(): src = data["path"] dst_name = sanitize_filename(src.name) dst = PAGES_DIR / dst_name try: shutil.copy2(src, dst) canonical_pages[key] = dst_name copied += 1 except Exception as e: problems.append(f"Copy failed {src}: {e}") print(f"{copied} pages copiées") # -------------------------------------------------- # SAVE REGISTRY # -------------------------------------------------- registry = { "canonical_pages": canonical_pages, "equivalences": equivalences, "redirects": redirects, "ignored_pages": ignored_pages, } REGISTRY_PATH.parent.mkdir(exist_ok=True) with open(REGISTRY_PATH, "w", encoding="utf-8") as f: json.dump(registry, f, indent=2, ensure_ascii=False) # -------------------------------------------------- # REPORT # -------------------------------------------------- with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write("=== MIGRATION REPORT ===\n") f.write(f"Canonical pages: {len(canonical_pages)}\n") f.write(f"Equivalences: {len(equivalences)}\n") f.write(f"Redirects: {len(redirects)}\n") f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Problems: {len(problems)}\n\n") for p in problems[:200]: f.write(p + "\n") print("\n✅ PREPARATION COMPLETE")