# whu_migration_scripts/prepare_pages_and_registry.py
# (viewer chrome from the scraped history page preserved as a comment:
#  590 lines / 16 KiB / Python / Raw / Normal View / History / 2026-04-03 23:42:54 +02:00)
import os
import re
import json
import shutil
import html
from pathlib import Path
from collections import defaultdict
# 2026-04-07 15:06:30 +02:00 (scraped revision timestamp, not source code)
from difflib import SequenceMatcher
# 2026-04-09 12:05:15 +02:00 (scraped revision timestamp, not source code)
from bs4 import BeautifulSoup
# 2026-04-10 10:29:21 +02:00 (scraped revision timestamp, not source code)
import unicodedata
# 2026-04-03 23:42:54 / 2026-04-14 15:15:34 +02:00 (scraped revision timestamps)
# Input directory with the scraped *.html pages.
SOURCE_DIR = Path("../test")

# Output layout: copied pages plus the equivalence registry and a text report.
OUTPUT_DIR = Path("../output")
# "/" on a Path already yields a Path — the previous Path(...) wrappers were redundant.
PAGES_DIR = OUTPUT_DIR / "pages"
REGISTRY_PATH = OUTPUT_DIR / "equivalence_registry.json"
REPORT_PATH = OUTPUT_DIR / "migration_report.txt"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PAGES_DIR.mkdir(parents=True, exist_ok=True)
# --------------------------------------------------
# Helpers
# --------------------------------------------------
# Characters Windows forbids in filenames; replaced by "_" in sanitize_filename().
INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
# These patterns pull values out of the MediaWiki JS config blob that each
# scraped page embeds ("wgArticleId", "wgIsRedirect", ...).
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
# Filenames that look like a bare slug, e.g. "foo" or "foo123".
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
# Literal \uXXXX escape sequences found inside the scraped JS strings.
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
# Lowercase substrings that identify a captured HTTP error page.
ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]
# Titles such as "League Model - Something" carry a prefix we alias away.
LEAGUE_PREFIX_RE = re.compile(r"^league model\s*-\s*", re.I)


def strip_league_prefix(title: str) -> str:
    """Return *title* normalized, with any leading 'league model -' removed."""
    normalized = normalize_title(title)
    return LEAGUE_PREFIX_RE.sub("", normalized).strip()


def is_league_title(title: str) -> bool:
    """True when the normalized title starts with the 'league model -' prefix."""
    return LEAGUE_PREFIX_RE.match(normalize_title(title)) is not None
def is_error_page(page_html: str) -> bool:
    """Heuristically detect a captured HTTP error page (4xx/5xx body)."""
    lowered = page_html.lower()
    # Content without an <html> tag is not judged by this heuristic.
    if "<html" not in lowered:
        return False
    return any(pattern in lowered for pattern in ERROR_PAGE_PATTERNS)
def decode_mediawiki_string(s: str) -> str:
    """Decode a string scraped from MediaWiki's inline JS config.

    Applies, in order:
      1. HTML entity unescaping (e.g. ``&amp;`` -> ``&``).
      2. Decoding of literal ``\\uXXXX`` escape sequences only (safe:
         other backslash escapes are left untouched).
      3. Recombination of UTF-16 surrogate pairs, so astral-plane
         characters escaped as two \\uXXXX units (e.g. emoji) become one
         valid character instead of two lone surrogates that would break
         later UTF-8 encoding / JSON dumping.

    Empty input is returned unchanged.
    """
    if not s:
        return s
    # 1 — HTML entities
    s = html.unescape(s)

    # 2 — decode ONLY \uXXXX sequences (same pattern as UNICODE_ESCAPE_RE)
    def repl(m):
        return chr(int(m.group(1), 16))

    s = re.sub(r"\\u([0-9a-fA-F]{4})", repl, s)

    # 3 — fold high+low surrogate pairs back into a single code point
    s = re.sub(
        "[\ud800-\udbff][\udc00-\udfff]",
        lambda m: chr(
            0x10000
            + ((ord(m.group(0)[0]) - 0xD800) << 10)
            + (ord(m.group(0)[1]) - 0xDC00)
        ),
        s,
    )
    return s
def similarity(a, b):
    """Return difflib's similarity ratio in [0, 1] for the two sequences."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def normalize_title(title: str) -> str:
    """Normalize a wiki title for keying and comparison.

    Steps: trim, Unicode NFKC, underscores -> spaces, typographic
    quotes -> ASCII, whitespace collapsed, casefolded.
    """
    title = title.strip()
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # Map curly quotes to ASCII. The non-ASCII literals were lost in
    # transit, leaving empty-string .replace("", ...) calls that would
    # insert a quote between every character; restored here as explicit
    # code points (U+2019 U+2018 -> ' ; U+201D U+201C -> ").
    title = title.translate({0x2019: "'", 0x2018: "'", 0x201D: '"', 0x201C: '"'})
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
def sanitize_filename(name: str) -> str:
    """Replace Windows-forbidden characters with "_" and cap length at 180."""
    cleaned = re.sub(INVALID_WIN_CHARS, "_", name)
    return cleaned[:180]
def extract_wg_page_name(page_html: str) -> str | None:
    """Return the decoded "wgPageName" JS config value, or None if absent."""
    match = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
    if match is None:
        return None
    return decode_mediawiki_string(match.group(1)).replace("_", " ")
def extract_page_identity(page_html: str):
    """Best-effort page identity: "wgPageName" first, then the <title> tag."""
    name = extract_wg_page_name(page_html)
    if name:
        return name
    # Fall back to the HTML <title>, trimmed at the first " -" suffix.
    title_match = re.search(r"<title>(.*?) -", page_html, re.I)
    if title_match:
        return decode_mediawiki_string(title_match.group(1))
    return None
def extract_article_id(page_html: str) -> int | None:
    """Return "wgArticleId" as an int when present and positive, else None."""
    match = ARTICLE_ID_RE.search(page_html)
    if match is None:
        return None
    article_id = int(match.group(1))
    # Non-positive ids are treated as "no real article".
    return article_id if article_id > 0 else None
def extract_internal_redirect(page_html: str):
    """Return the title this page was redirected from, or None."""
    match = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
    if match is None:
        return None
    return decode_mediawiki_string(match.group(1)).replace("_", " ")
def extract_namespace(page_html: str) -> str:
    """Return the page's "wgCanonicalNamespace" value, or "" when absent."""
    match = NAMESPACE_RE.search(page_html)
    return match.group(1) if match else ""
def extract_wg_title(page_html):
    """Return the decoded "wgTitle" JS config value, or None if absent."""
    match = WG_TITLE_RE.search(page_html)
    if match is None:
        return None
    return decode_mediawiki_string(match.group(1))
def normalize_reference_key(key: str) -> str:
    """Canonicalize a registry lookup key.

    Builds on normalize_title(), then strips a leading "category"
    namespace marker, maps typographic quotes to ASCII and collapses
    whitespace.
    """
    key = normalize_title(key)
    # normalize the category namespace prefix away (any separator)
    key = re.sub(r"^category[\s:_]+", "", key)
    # Typographic apostrophes/quotes -> ASCII. The curly-quote literals
    # were lost in transit, leaving empty-string .replace() calls that
    # would insert quotes between every character; restored as explicit
    # code points (U+2019 U+2018 -> ' ; U+201D U+201C -> ").
    key = key.replace("\u2019", "'").replace("\u2018", "'")
    key = key.replace("\u201d", '"').replace("\u201c", '"')
    # collapse whitespace runs
    key = re.sub(r"\s+", " ", key)
    return key.strip()
def has_editorial_content(html_page: str) -> bool:
    """Decide whether a page carries hand-written (editorial) text.

    Non-category pages (no ".mw-category-generated" node) always count
    as editorial. For category pages, only the text appearing BEFORE the
    auto-generated member listing is considered; more than 200 characters
    of such text qualifies the page.
    """
    soup = BeautifulSoup(html_page, "html.parser")
    content = soup.find(id="mw-content-text")
    if not content:
        return False
    auto = content.select_one(".mw-category-generated")
    if not auto:
        return True  # not an auto-generated category listing
    # Collect text of children that precede the generated listing.
    # NOTE(review): assumes the generated listing is a DIRECT child of
    # #mw-content-text — if it is nested deeper, the break never fires and
    # all text counts. Confirm against real category pages.
    editorial_text = ""
    for child in content.children:
        # Only Tag children have .get; the getattr guard skips the class
        # check for plain text nodes (their text is still accumulated below).
        if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []):
            break
        editorial_text += child.get_text(" ", strip=True)
    editorial_text = editorial_text.strip()
    return len(editorial_text) > 200
# --------------------------------------------------
# Registry structures
# --------------------------------------------------
ignored_pages = []  # filenames deliberately skipped (bad namespace, no article id)
problems = []  # human-readable anomaly messages, written to the report
redirects = {}  # normalized source title -> redirect target (regular pages)
all_variants = defaultdict(list)  # article id (or "error::" key) -> variant dicts
files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")
# --------------------------------------------------
# PASS 1 — analyze pages and collect variants
# --------------------------------------------------
category_redirects = {}  # redirect sources coming from category/listing pages

for i, path in enumerate(files, 1):
    try:
        page_html = path.read_text(encoding="utf-8", errors="replace")
        article_id = extract_article_id(page_html)
        if not article_id:
            # No article id: either a plain skip, or a captured HTTP error page.
            if not is_error_page(page_html):
                ignored_pages.append(path.name)
                continue
            # Error captures are grouped under a synthetic "error::" key so
            # duplicates of the same slug still end up in one variant group.
            fake_title = normalize_title(path.stem)
            all_variants[f"error::{fake_title}"].append({
                "path": path,
                "title": fake_title,
                "canonical_key": fake_title,
                "article_id": None,
                "wg_title": None,
                "redirect": False,
                "is_category": False,
                "is_listing_only": False,
                "is_error": True,
            })
            problems.append(f"Error page detected: {path.name}")
            continue
        title = extract_page_identity(page_html)
        if not title:
            problems.append(f"No title: {path}")
            continue
        ns = extract_namespace(page_html)
        # Skip non-content namespaces entirely.
        if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
            ignored_pages.append(path.name)
            continue
        title = decode_mediawiki_string(title)
        norm = normalize_title(title)
        page_name = extract_wg_page_name(page_html)
        # full_title keeps the namespace prefix (from wgPageName) when available.
        full_title = normalize_title(page_name) if page_name else norm
        base_title = norm
        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        is_category = ns == "Category" or norm.startswith("category:")
        has_content = has_editorial_content(page_html)
        is_listing_only = is_category and not has_content
        wg_title = extract_wg_title(page_html)
        # Categories: tell real categories apart from regular pages that
        # merely live under the Category namespace.
        if ns == "Category":
            m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
            if m_title:
                wg_title = decode_mediawiki_string(m_title.group(1))
                cat_base = normalize_title(wg_title)
                page_norm = normalize_title(page_name) if page_name else None
                if page_norm and page_norm != f"category:{cat_base}":
                    # a real page disguised as a category
                    base_title = page_norm
                    is_category = False
                else:
                    base_title = cat_base
                    is_category = True
            else:
                base_title = norm.replace("category:", "", 1)
                is_category = True
        # Internal redirect (wgRedirectedFrom): record source -> target,
        # in a separate map for category/listing pages.
        redir = extract_internal_redirect(page_html)
        if redir:
            key = full_title
            target = normalize_title(redir)
            if is_listing_only or is_category:
                category_redirects[key] = target
            else:
                redirects[key] = target
        is_league = is_league_title(full_title)
        base_no_league = strip_league_prefix(full_title)
        canonical_key = normalize_reference_key(full_title)
        all_variants[article_id].append({
            "path": path,
            "title": base_title,
            "canonical_key": canonical_key,
            "article_id": article_id,
            "wg_title": normalize_title(wg_title) if wg_title else None,
            "redirect": is_redirect,
            "is_category": is_category,
            "is_listing_only": is_listing_only,
            "is_error": False,
            "is_league": is_league,
            "base_no_league": base_no_league,
        })
    except Exception as e:
        problems.append(f"{path}: {e}")
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysés")

print("Variants collected:", len(all_variants))
print("Added category_redirect from category/listing:", len(category_redirects))
# --------------------------------------------------
# PASS 2 — choosing the canonical versions
# --------------------------------------------------
canonical_pages = {}  # article id -> {"path", "title", "redirect"} (replaced by filename in PASS 5)
potential_tags = defaultdict(list)  # listing-only categories kept as tag candidates
equivalences = {}  # alias key -> canonical key
category_renamed = 0  # chosen pages whose file was a "category_*" slug
category_not_chosen = 0  # category homonyms that lost the canonical vote
error_pages = []  # filenames of pages kept despite being HTTP error captures
def slug_to_title(filename: str) -> str:
    """Turn a file slug into a normalized title, dropping trailing digits.

    NOTE(review): appears unused in this script.
    """
    stem = Path(filename).stem
    stem = re.sub(r"\d+$", "", stem)
    return normalize_title(stem)
def filename_similarity_score(filename, wg_title):
    """Similarity between a normalized filename (numeric suffix removed) and wgTitle."""
    if not wg_title:
        return 0
    name = normalize_title(filename)
    title = normalize_title(wg_title)
    # drop numeric suffixes like "...2"
    name = re.sub(r"\d+$", "", name)
    return similarity(name, title)
def variant_score(v):
    """Sort key for picking the canonical variant — lower tuples win.

    Penalty order: listing-only, redirect, league-prefixed, not a short
    slug, long/comma/underscore filename, lower similarity with wgTitle,
    longer filename, then lexicographic tie-break. Error captures always
    sort last.
    """
    if v.get("is_error"):
        return (True, True, True, True, True, 0, 9999, "zzz")

    stem = v["path"].stem
    stem_norm = normalize_title(stem)
    sim = filename_similarity_score(stem_norm, v["wg_title"])
    short_slug = SHORT_SLUG_RE.match(stem_norm.replace(" ", "")) is not None
    looks_long = "," in stem or "_" in stem or len(stem) > 40
    return (
        v["is_listing_only"],
        v["redirect"],
        v.get("is_league", False),
        not short_slug,
        looks_long,
        -sim,
        len(stem),
        stem.lower(),
    )
def add_equivalence(k, v):
    """Record alias k -> v (both canonicalized); warn when v is not canonical."""
    k = normalize_reference_key(k)
    v = normalize_reference_key(v)
    if k == v:
        return
    # NOTE: linear scan over canonical_pages on every call — fine at this scale.
    canonical_titles = [entry["title"] for entry in canonical_pages.values()]
    if v not in canonical_titles:
        print("⚠️ Adding equivalence to NON-CANONICAL value:", k, "->", v)
    equivalences[k] = v
2026-04-09 12:05:15 +02:00
for article_id, variants in all_variants.items():
variants_sorted = sorted(variants, key=variant_score)
2026-04-04 01:32:39 +02:00
chosen = variants_sorted[0]
2026-04-10 14:39:10 +02:00
canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem))
2026-04-04 01:32:39 +02:00
2026-04-09 12:05:15 +02:00
# categories listing-only
if chosen["is_listing_only"]:
tag_name = normalize_reference_key(chosen["title"])
for v in variants:
potential_tags[tag_name].append(normalize_title(v["path"].stem))
if v["wg_title"]:
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
continue
2026-04-04 01:32:39 +02:00
2026-04-10 13:42:42 +02:00
if all(v.get("is_error") for v in variants):
chosen_variant = variants[0]
canonical_pages[article_id] = {
"path": chosen_variant["path"],
"title": normalize_reference_key(chosen_variant["path"].stem),
"redirect": False,
}
error_pages.append(chosen_variant["path"].name)
continue
2026-04-04 01:32:39 +02:00
canonical_pages[article_id] = {
"path": chosen["path"],
2026-04-09 12:05:15 +02:00
"title": canonical_slug,
2026-04-04 01:32:39 +02:00
"redirect": chosen["redirect"],
}
2026-04-10 13:42:42 +02:00
if chosen.get("is_error"):
error_pages.append(chosen["path"].name)
2026-04-07 15:06:30 +02:00
for v in variants:
2026-04-09 12:05:15 +02:00
if v["is_category"] and not v["is_listing_only"]:
# catégorie non choisie
if v is not chosen:
category_not_chosen += 1
# catégorie choisie mais qui est une category_* → renommée
elif chosen["path"].stem.lower().startswith("category"):
category_renamed += 1
if v is not chosen:
filename_key = normalize_title(Path(v["path"]).stem)
add_equivalence(filename_key, canonical_slug)
2026-04-10 14:39:10 +02:00
if v.get("is_league"):
league_key = normalize_reference_key(v["canonical_key"])
base_key = normalize_reference_key(v["base_no_league"])
add_equivalence(league_key, base_key)
2026-04-09 12:05:15 +02:00
print(f"{len(canonical_pages)} pages canoniques")
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
print(f"{category_renamed} pages prefix 'category_*' renommées")
print(f"{len(potential_tags)} potential_tags enregistrés")
2026-04-10 13:42:42 +02:00
print(f"{len(error_pages)} error_pages détectées")
# --------------------------------------------------
# PASS 3 — resolve redirects
# --------------------------------------------------
def resolve_redirect(key):
    """Follow the redirects map to a fixed point, guarding against cycles.

    NOTE(review): superseded by resolve_all() below; appears unused.
    """
    visited = set()
    while key in redirects and key not in visited:
        visited.add(key)
        key = redirects[key]
    return key
def resolve_all(key):
    """Follow redirects and equivalences to a fixed point (cycle-safe).

    Redirects take precedence over equivalences at each step.
    """
    visited = set()
    while key not in visited:
        visited.add(key)
        if key in redirects:
            key = redirects[key]
        elif key in equivalences:
            key = equivalences[key]
        else:
            break
    return key
skipped_redirect = 0
# Titles that actually exist as canonical output pages.
valid_titles = {
    data["title"]
    for data in canonical_pages.values()
}
# Merge page and category redirects; category entries win on key clashes.
for k, v in {**redirects, **category_redirects}.items():
    if k == v:
        continue  # self-redirect, nothing to record
    final = resolve_all(v)
    # Only keep redirects whose final target is a real canonical page.
    if final in valid_titles and k != final:
        equivalences[k] = final
    else:
        skipped_redirect += 1

print(f"Skipped redirect to non-canonical: {skipped_redirect}")
# --------------------------------------------------
# PASS 4 — final normalization of the equivalences
# --------------------------------------------------
def resolve_equivalence(key):
    """Chase equivalence chains to their final target, stopping on cycles."""
    visited = set()
    while key in equivalences and key not in visited:
        visited.add(key)
        key = equivalences[key]
    return key
# Rebuild the equivalence map with fully-resolved, canonical-only targets.
clean_equivalences = {}
for k, v in equivalences.items():
    final = resolve_equivalence(v)
    if final in valid_titles and k != final:
        clean_equivalences[k] = final
    else:
        # Drop the mapping; only report it when the target is non-canonical.
        if final not in valid_titles:
            problems.append(f"Non canonical mapping: {k} -> {final}")
equivalences = clean_equivalences

print(f"Equivalences kept: {len(equivalences)}")
# --------------------------------------------------
# PASS 5 — copy the canonical pages
# --------------------------------------------------
def title_to_filename(title: str) -> str:
    """Build the on-disk HTML filename for a canonical title.

    Spaces become underscores, typographic quotes become ASCII, the
    result is casefolded and length-capped by sanitize_filename().
    """
    name = title.replace(" ", "_")
    # Typographic quotes -> ASCII. The curly-quote literals were lost in
    # transit, leaving empty-string .replace() calls that would insert
    # quotes between every character; restored as explicit code points.
    name = name.replace("\u2019", "'").replace("\u2018", "'")
    name = name.replace("\u201d", '"').replace("\u201c", '"')
    return sanitize_filename(name.casefold() + ".html")
copied = 0
total = len(canonical_pages)
for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
    src = data["path"]
    dst_name = title_to_filename(data["title"])
    dst = PAGES_DIR / dst_name
    try:
        shutil.copy2(src, dst)
        # Replace the record with the output filename. Mutating values (not
        # keys) is safe while iterating .items(); failed copies keep their dict.
        canonical_pages[article_id] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")
    if i % 200 == 0 or i == total:
        print(f"{i}/{total} copiés")

print(f"{copied} pages copiées")
# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------
# Entries whose copy failed in PASS 5 still hold {"path": Path, ...};
# json.dump cannot serialize Path objects, so coerce those leftovers to
# plain source-path strings instead of crashing the whole export.
serializable_pages = {
    aid: (entry if isinstance(entry, str) else str(entry["path"]))
    for aid, entry in canonical_pages.items()
}
registry = {
    "canonical_pages": serializable_pages,
    "equivalences": equivalences,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
    "error_pages": error_pages,
}
REGISTRY_PATH.parent.mkdir(exist_ok=True)
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=2, ensure_ascii=False)
# --------------------------------------------------
# REPORT
# --------------------------------------------------
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
    f.write(f"Canonical pages: {len(canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
    f.write(f"Error pages: {len(error_pages)}\n")
    # One anomaly message per line, appended after the summary block.
    for p in problems:
        f.write(p + "\n")

print("\n✅ PREPARATION COMPLETE")