diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
new file mode 100644
index 0000000..4b0ac16
--- /dev/null
+++ b/prepare_pages_and_registry.py
@@ -0,0 +1,252 @@
+import os
+import re
+import json
+import shutil
+import html
+from pathlib import Path
+from collections import defaultdict
+
+SOURCE_DIR = Path("../original_index")
+OUTPUT_DIR = Path("../output")
+
+PAGES_DIR = OUTPUT_DIR / "pages"
+REGISTRY_PATH = OUTPUT_DIR / "equivalence_registry.json"
+REPORT_PATH = OUTPUT_DIR / "migration_report.txt"
+
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+PAGES_DIR.mkdir(parents=True, exist_ok=True)
+
+# --------------------------------------------------
+# Helpers
+# --------------------------------------------------
+
+INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
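+# MediaWiki embeds page metadata as mw.config values in an inline <script>;
+# the regexes below read them straight out of the raw HTML, e.g. (illustrative):
+#   "wgArticleId":12345,"wgIsRedirect":true,"wgCanonicalNamespace":"Category"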
+ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
+IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
+NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
+
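+# Titles can differ by underscores vs. spaces, runs of whitespace, or letter
+# case; fold all three so lookups agree across variants.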
+def normalize_title(title: str) -> str:
+ title = title.strip()
+ title = title.replace("_", " ")
+ title = re.sub(r"\s+", " ", title)
+ return title.casefold()
+
+
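+# Replace characters Windows forbids in filenames with "_" and cap the length
+# well under the classic 260-character MAX_PATH limit.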
+def sanitize_filename(name: str) -> str:
+ name = re.sub(INVALID_WIN_CHARS, "_", name)
+ return name[:180]
+
+
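+# wgPageName uses underscores ("Main_Page"); convert them to spaces so the
+# value lines up with display titles.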
+def extract_wg_page_name(page_html: str) -> str | None:
+ m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
+ if m:
+ return html.unescape(m.group(1)).replace("_", " ")
+ return None
+
+
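+# Prefer the structured wgPageName value; fall back to the <title> tag, which
+# typically reads "<page title> - <site name>".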
+def extract_page_identity(page_html: str):
+    page = extract_wg_page_name(page_html)
+    if page:
+        return page
+
+    # fallback title tag
+    m = re.search(r"<title>(.*?) -", page_html, re.I)
+    if m:
+        return html.unescape(m.group(1))
+
+    return None
+
+
+def extract_article_id(page_html: str) -> int | None:
+    m = ARTICLE_ID_RE.search(page_html)
+    if m:
+        aid = int(m.group(1))
+        if aid > 0:  # MediaWiki reports id 0 for special and missing pages
+            return aid
+    return None
+
+
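+# Matches wikitext-style redirects ("#REDIRECT [[Target]]"). Unused for now:
+# rendered HTML dumps usually signal redirects via wgIsRedirect and
+# wgRedirectedFrom instead.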
+def extract_redirect(page_html: str) -> str | None:
+    m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", page_html, re.I)
+    if m:
+        return m.group(1).strip()
+    return None
+
+
+def namespace_of(title: str):
+    if ":" in title:
+        return title.split(":", 1)[0]
+    return None
+
+
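+# wgRedirectedFrom is present when the page was reached through a redirect;
+# it holds the title that was originally requested.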
+def extract_internal_redirect(page_html: str):
+ m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
+ if m:
+ return html.unescape(m.group(1)).replace("_", " ")
+ return None
+
+
+def extract_namespace(page_html: str) -> str:
+    m = NAMESPACE_RE.search(page_html)
+    if m:
+        return m.group(1)
+    return ""
+
+# --------------------------------------------------
+# Registry structures
+# --------------------------------------------------
+
+canonical_pages = {}  # article_id -> {"path", "title", "redirect"}; filename after PASS 3
+equivalences = {}     # normalized title -> canonical normalized title
+redirects = {}        # normalized source title -> normalized target title
+ignored_pages = []    # filenames skipped (no article id, or ignored namespace)
+problems = []         # error strings collected for the final report
+
+files = list(SOURCE_DIR.glob("*.html"))
+print(f"{len(files)} fichiers trouvés")
+
+
+# --------------------------------------------------
+# PASS 1 — analysis
+# --------------------------------------------------
+
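+# Each file is classified exactly once: skipped (no article id or an ignored
+# namespace), recorded as a redirect source, and/or registered as the
+# canonical copy for its article id.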
+for i, path in enumerate(files, 1):
+
+ try:
+ page_html = path.read_text(encoding="utf-8", errors="ignore")
+
+ article_id = extract_article_id(page_html)
+ if not article_id:
+ ignored_pages.append(path.name)
+ continue
+
+ title = extract_page_identity(page_html)
+ if not title:
+ problems.append(f"No title: {path}")
+ continue
+
+ ns = extract_namespace(page_html)
+
+ # -------------------------
+ # Ignore namespaces
+ # -------------------------
+ if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
+ ignored_pages.append(path.name)
+ continue
+
+        # extract_page_identity() already unescaped HTML entities.
+        norm = normalize_title(title)
+
+        # -------------------------
+        # Category pages
+        # -------------------------
+        # Category pages CAN be canonical content
+        if ns == "Category":
+            equivalences[norm] = norm
+
+        # -------------------------
+        # Redirect detection
+        # -------------------------
+        redir = extract_internal_redirect(page_html)
+        if redir:
+            redirects[normalize_title(redir)] = norm
+
+        # -------------------------
+        # Canonical article
+        # -------------------------
+        # Several source files can share one article id (URL variants);
+        # keep the first seen, but let a non-redirect copy replace a redirect.
+        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
+        if article_id not in canonical_pages:
+            canonical_pages[article_id] = {
+                "path": path,
+                "title": norm,
+                "redirect": is_redirect,
+            }
+        elif canonical_pages[article_id]["redirect"] and not is_redirect:
+            canonical_pages[article_id] = {
+                "path": path,
+                "title": norm,
+                "redirect": is_redirect,
+            }
+
+        # self equivalence
+        equivalences[norm] = norm
+
+ except Exception as e:
+ problems.append(f"{path}: {e}")
+
+ if i % 200 == 0:
+ print(f"{i}/{len(files)} analysés")
+
+
+# --------------------------------------------------
+# PASS 2 — resolve redirects
+# --------------------------------------------------
+
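+# Follow a redirect chain to its final target; the `seen` set breaks cycles
+# (A -> B -> A) instead of looping forever.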
+def resolve_redirect(key):
+ seen = set()
+ while key in redirects and key not in seen:
+ seen.add(key)
+ key = redirects[key]
+ return key
+
+
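+# Materialize every redirect source in equivalences, fully resolved.
+# Example: redirects {"foo": "bar", "bar": "baz"} yields the equivalences
+# entries foo -> baz and bar -> baz.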
+for k, v in list(redirects.items()):
+ equivalences[k] = resolve_redirect(v)
+
+
+# --------------------------------------------------
+# PASS 3 — copy canonical pages
+# --------------------------------------------------
+
+copied = 0
+
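+# Note: sanitize_filename() can map two distinct source names to the same
+# destination name, in which case the later copy silently wins.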
+for key, data in canonical_pages.items():
+
+ src = data["path"]
+ dst_name = sanitize_filename(src.name)
+ dst = PAGES_DIR / dst_name
+
+    try:
+        shutil.copy2(src, dst)
+        canonical_pages[key] = dst_name
+        copied += 1
+    except Exception as e:
+        problems.append(f"Copy failed {src}: {e}")
+        # Drop the Path-bearing dict so json.dump() below cannot choke on it.
+        canonical_pages[key] = None
+
+print(f"{copied} pages copiées")
+
+
+# --------------------------------------------------
+# SAVE REGISTRY
+# --------------------------------------------------
+
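+# canonical_pages is now article_id -> copied filename (or None when the copy
+# failed); json.dump() serializes the int keys as strings.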
+registry = {
+ "canonical_pages": canonical_pages,
+ "equivalences": equivalences,
+ "redirects": redirects,
+ "ignored_pages": ignored_pages,
+}
+
+REGISTRY_PATH.parent.mkdir(exist_ok=True)
+
+with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
+ json.dump(registry, f, indent=2, ensure_ascii=False)
+
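+# A consumer can then resolve any incoming title with one lookup, e.g.
+#   registry["equivalences"].get(normalize_title(user_title))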
+
+# --------------------------------------------------
+# REPORT
+# --------------------------------------------------
+
+with open(REPORT_PATH, "w", encoding="utf-8") as f:
+ f.write("=== MIGRATION REPORT ===\n")
+ f.write(f"Canonical pages: {len(canonical_pages)}\n")
+ f.write(f"Equivalences: {len(equivalences)}\n")
+ f.write(f"Redirects: {len(redirects)}\n")
+ f.write(f"Ignored: {len(ignored_pages)}\n")
+ f.write(f"Problems: {len(problems)}\n\n")
+
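+    # Only the first 200 problems are written; the full count appears above.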
+ for p in problems[:200]:
+ f.write(p + "\n")
+
+print("\n✅ PREPARATION COMPLETE")
\ No newline at end of file