mapping before dedup
This commit is contained in:
parent
e4aaa33137
commit
c7b45432b1
1 changed files with 252 additions and 0 deletions
252
prepare_pages_and_registry.py
Normal file
252
prepare_pages_and_registry.py
Normal file
|
|
@ -0,0 +1,252 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import html
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Input directory containing the raw scraped wiki pages (*.html).
SOURCE_DIR = Path("../original_index")
# Root directory for everything this script produces.
OUTPUT_DIR = Path("../output")

# Deduplicated canonical pages are copied here.
PAGES_DIR = Path(OUTPUT_DIR / "pages")
# JSON registry: canonical pages, equivalences, redirects, ignored files.
REGISTRY_PATH = Path(OUTPUT_DIR / "equivalence_registry.json")
# Human-readable run summary (counts plus the first recorded problems).
REPORT_PATH = Path(OUTPUT_DIR / "migration_report.txt")

# Create output directories up front so later copies/writes cannot fail
# on a missing parent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PAGES_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# --------------------------------------------------
# Helpers
# --------------------------------------------------

# Characters not allowed in Windows filenames (regex character class).
INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
# MediaWiki embeds page metadata as JS config variables in the saved
# HTML; these patterns extract the values directly.
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')  # numeric page id
IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')  # page is a redirect
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')  # e.g. "Category"
|
||||||
|
|
||||||
|
def normalize_title(title: str) -> str:
    """Return a canonical comparison key for a wiki title.

    Surrounding whitespace is stripped, underscores become spaces, runs
    of whitespace collapse to a single space, and the result is
    case-folded so lookups are case-insensitive.
    """
    cleaned = re.sub(r"\s+", " ", title.strip().replace("_", " "))
    return cleaned.casefold()
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_filename(name: str) -> str:
    """Make *name* safe to use as a Windows filename.

    Each character Windows forbids in filenames is replaced with an
    underscore and the result is capped at 180 characters.
    """
    safe = re.sub(r'[<>:"/\\|?*]', "_", name)
    return safe[:180]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_wg_page_name(page_html: str) -> str | None:
|
||||||
|
m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
|
||||||
|
if m:
|
||||||
|
return html.unescape(m.group(1)).replace("_", " ")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page_identity(page_html: str):
    """Best-effort extraction of a page's title.

    Tries the wgPageName JS config variable first, then falls back to
    the document <title> tag (stripping the " - <site>" suffix).
    Returns None when neither source yields a title.

    Bug fix: the parameter was previously named ``html``, shadowing the
    stdlib ``html`` module, so the fallback's ``html.unescape(...)``
    call raised AttributeError on a str. Renamed to ``page_html``
    (all call sites are positional).
    """
    page = extract_wg_page_name(page_html)
    if page:
        return page

    # fallback: the <title> tag, rendered by MediaWiki as
    # "Page title - Site name"
    m = re.search(r"<title>(.*?) -", page_html, re.I)
    if m:
        return html.unescape(m.group(1))

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_article_id(page_html: str) -> int | None:
    """Return the positive wgArticleId of a page, or None.

    Special/missing pages carry wgArticleId 0, which is treated the
    same as "no id".

    (Parameter renamed from ``html`` to avoid shadowing the stdlib
    ``html`` module, matching the sibling extractors; call sites are
    positional.)
    """
    m = ARTICLE_ID_RE.search(page_html)
    if m:
        aid = int(m.group(1))
        if aid > 0:
            return aid
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_redirect(html: str) -> str | None:
|
||||||
|
m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I)
|
||||||
|
if m:
|
||||||
|
return m.group(1).strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def namespace_of(title: str):
    """Return the namespace prefix of *title* ("Category:Foo" -> "Category").

    Titles without a colon have no namespace prefix; None is returned
    for them (matching the original's implicit-None behavior).
    """
    prefix, sep, _rest = title.partition(":")
    return prefix if sep else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_internal_redirect(page_html: str):
    """Return the title the server redirected from (wgRedirectedFrom).

    The value is HTML-unescaped and underscores are mapped to spaces so
    it matches titles produced elsewhere in this script. Returns None
    when the variable is absent.
    """
    match = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
    if match is None:
        return None
    return html.unescape(match.group(1)).replace("_", " ")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_namespace(page_html: str) -> str:
    """Return the page's canonical namespace ("" for the main namespace).

    Reads the wgCanonicalNamespace JS config variable; an empty string
    is returned both for main-namespace pages and when the variable is
    missing.

    (Parameter renamed from ``html`` to avoid shadowing the stdlib
    ``html`` module; call sites are positional.)
    """
    m = NAMESPACE_RE.search(page_html)
    if m:
        return m.group(1)
    return ""
|
||||||
|
|
||||||
|
# --------------------------------------------------
# Registry structures
# --------------------------------------------------

# article_id -> {"path": Path, "title": normalized title, "redirect": bool}
# (values are replaced by the copied filename in PASS 3).
canonical_pages = {}
# normalized title -> normalized canonical title it resolves to.
equivalences = {}
# normalized redirect source -> normalized target title.
redirects = {}
# filenames skipped (no article id, or ignored namespace).
ignored_pages = []
# human-readable error strings collected during processing.
problems = []

files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 1 — analyse
# --------------------------------------------------
# Scan every saved page, decide whether it is real content, an ignored
# namespace page, or a redirect, and record one canonical entry per
# article id.

for i, path in enumerate(files, 1):

    try:
        page_html = path.read_text(encoding="utf-8", errors="ignore")

        # Pages without a positive wgArticleId (special/missing pages)
        # are not content — skip them.
        article_id = extract_article_id(page_html)
        if not article_id:
            ignored_pages.append(path.name)
            continue

        title = extract_page_identity(page_html)
        if not title:
            problems.append(f"No title: {path}")
            continue

        ns = extract_namespace(page_html)

        # -------------------------
        # Ignore namespaces
        # -------------------------
        # Maintenance/discussion namespaces are dropped entirely.
        if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
            ignored_pages.append(path.name)
            continue

        title = html.unescape(title)
        norm = normalize_title(title)

        # -------------------------
        # Category pages
        # -------------------------
        # Category pages CAN be canonical content: register a self
        # equivalence, then deliberately fall through (no `continue`)
        # so they also run the redirect/canonical logic below.
        if ns == "Category":
            norm = normalize_title(title)
            equivalences[norm] = norm

        # -------------------------
        # Redirect detection
        # -------------------------
        # wgRedirectedFrom means the server followed a redirect to get
        # here: map the source title onto this page's title.
        redir = extract_internal_redirect(page_html)
        if redir:
            redirects[normalize_title(redir)] = norm
        # -------------------------
        # Canonical article
        # -------------------------

        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        # First file seen for an article id wins, unless the stored
        # entry was a redirect page and this file is real content.
        if article_id not in canonical_pages:
            canonical_pages[article_id] = {
                "path": path,
                "title": norm,
                "redirect": is_redirect,
            }
        elif canonical_pages[article_id]["redirect"] and not is_redirect:
            canonical_pages[article_id] = {
                "path": path,
                "title": norm,
                "redirect": is_redirect,
            }
        # self equivalence
        equivalences[norm] = norm

    except Exception as e:
        # Best-effort pass: record the failure and keep going.
        problems.append(f"{path}: {e}")

    # Progress indicator every 200 files.
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysés")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 2 — resolve redirects
# --------------------------------------------------

def resolve_redirect(key, mapping=None):
    """Follow a redirect chain to its final target title.

    Walks *mapping* (the module-level ``redirects`` dict when None,
    preserving the original behavior) until *key* is no longer a
    redirect source. Cycle-safe: each key is visited at most once, so
    A -> B -> A terminates and returns the last unvisited key.

    ``mapping`` was added as an optional, backward-compatible parameter
    so the resolver can be exercised in isolation.
    """
    if mapping is None:
        mapping = redirects
    seen = set()
    while key in mapping and key not in seen:
        seen.add(key)
        key = mapping[key]
    return key
|
||||||
|
|
||||||
|
|
||||||
|
# Flatten every redirect source to its final canonical target.
for k, v in list(redirects.items()):
    equivalences[k] = resolve_redirect(v)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# PASS 3 — copy canonical pages
# --------------------------------------------------
# Copy each surviving page into PAGES_DIR and replace the registry
# value with the destination filename (the Path/dict metadata is no
# longer needed after this point, and filenames are JSON-serializable).

copied = 0

for key, data in canonical_pages.items():

    src = data["path"]
    # NOTE(review): sanitize_filename truncates to 180 chars, so two
    # very long names could collide and silently overwrite — confirm
    # source names are short/unique enough.
    dst_name = sanitize_filename(src.name)
    dst = PAGES_DIR / dst_name

    try:
        shutil.copy2(src, dst)
        # Overwrite the metadata dict with just the copied filename;
        # mutating values (not keys) during iteration is safe.
        canonical_pages[key] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")

print(f"{copied} pages copiées")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------

registry = {
    "canonical_pages": canonical_pages,
    "equivalences": equivalences,
    "redirects": redirects,
    "ignored_pages": ignored_pages,
}

# Redundant with the mkdir calls at the top, but cheap and harmless.
REGISTRY_PATH.parent.mkdir(exist_ok=True)

# ensure_ascii=False keeps accented titles human-readable in the JSON.
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------
# REPORT
# --------------------------------------------------

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
    f.write(f"Canonical pages: {len(canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Redirects: {len(redirects)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")

    # Cap the detail section so a catastrophic run stays readable.
    for p in problems[:200]:
        f.write(p + "\n")

print("\n✅ PREPARATION COMPLETE")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue