# whu_migration_scripts/prepare_pages_and_registry.py
import os
import re
import json
import shutil
import html
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
import unicodedata
SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output")
PAGES_DIR = Path(OUTPUT_DIR / "pages")
REGISTRY_PATH = Path(OUTPUT_DIR / "equivalence_registry.json")
REPORT_PATH = Path(OUTPUT_DIR / "migration_report.txt")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PAGES_DIR.mkdir(parents=True, exist_ok=True)
# --------------------------------------------------
# Helpers
# --------------------------------------------------
INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
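# Lowercase substrings used by is_error_page() below to flag saved HTML dumps
# that are really HTTP error pages rather than wiki content.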
ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]
LEAGUE_PREFIX_RE = re.compile(r"^league model\s*-\s*", re.I)


def strip_league_prefix(title: str) -> str:
    title = normalize_title(title)
    return LEAGUE_PREFIX_RE.sub("", title).strip()


def is_league_title(title: str) -> bool:
    return bool(LEAGUE_PREFIX_RE.match(normalize_title(title)))


def is_error_page(page_html: str) -> bool:
    text = page_html.lower()
    if "<html" not in text:
        return False
    for pattern in ERROR_PAGE_PATTERNS:
        if pattern in text:
            return True
    return False
def decode_mediawiki_string(s: str) -> str:
    if not s:
        return s
    # 1 — HTML entities
    s = html.unescape(s)
    # 2 — decode ONLY \uXXXX sequences (safe)
    def repl(m):
        return chr(int(m.group(1), 16))
    s = UNICODE_ESCAPE_RE.sub(repl, s)
    return s
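# Illustrative example (hypothetical input, assumed behavior of the helper above):
#   decode_mediawiki_string(r"Club\u2019s page &amp; more") -> "Club's page & more"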
def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()


def normalize_title(title: str) -> str:
    title = title.strip()
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # typographic quotes -> ASCII equivalents
    title = title.replace("\u2019", "'").replace("\u2018", "'").replace("\u201c", '"').replace("\u201d", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
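# Illustrative example (hypothetical title, assumed behavior):
#   normalize_title("  Some_Club\u2019s   Page ") -> "some club's page"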
def sanitize_filename(name: str) -> str:
    name = re.sub(INVALID_WIN_CHARS, "_", name)
    return name[:180]


def extract_wg_page_name(page_html: str) -> str | None:
    m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
    if m:
        return decode_mediawiki_string(m.group(1)).replace("_", " ")
    return None


def extract_page_identity(page_html: str):
    page = extract_wg_page_name(page_html)
    if page:
        return page
    m = re.search(r"<title>(.*?) -", page_html, re.I)
    if m:
        return decode_mediawiki_string(m.group(1))
    return None


def extract_article_id(page_html: str) -> int | None:
    m = ARTICLE_ID_RE.search(page_html)
    if m:
        aid = int(m.group(1))
        if aid > 0:
            return aid
    return None


def extract_internal_redirect(page_html: str):
    m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
    if m:
        return decode_mediawiki_string(m.group(1)).replace("_", " ")
    return None


def extract_namespace(page_html: str) -> str:
    m = NAMESPACE_RE.search(page_html)
    if m:
        return m.group(1)
    return ""


def extract_wg_title(page_html):
    m = WG_TITLE_RE.search(page_html)
    if m:
        return decode_mediawiki_string(m.group(1))
    return None
def normalize_reference_key(key: str) -> str:
    key = normalize_title(key)
    # strip the Category namespace prefix
    key = re.sub(r"^category[\s:_]+", "", key)
    # typographic apostrophes/quotes -> ASCII
    key = key.replace("\u2019", "'").replace("\u2018", "'").replace("\u201c", '"').replace("\u201d", '"')
    # collapse whitespace
    key = re.sub(r"\s+", " ", key)
    return key.strip()
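# Illustrative example (hypothetical key, assumed behavior):
#   normalize_reference_key("Category:Junior_Teams") -> "junior teams"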
def has_editorial_content(html_page: str) -> bool:
    soup = BeautifulSoup(html_page, "html.parser")
    content = soup.find(id="mw-content-text")
    if not content:
        return False
    auto = content.select_one(".mw-category-generated")
    if not auto:
        return True  # not an auto-generated category listing
    # text BEFORE the listing
    editorial_text = ""
    for child in content.children:
        if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []):
            break
        editorial_text += child.get_text(" ", strip=True)
    editorial_text = editorial_text.strip()
    return len(editorial_text) > 200
# --------------------------------------------------
# Registry structures
# --------------------------------------------------
ignored_pages = []
problems = []
redirects = {}
all_variants = defaultdict(list)
files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")
# --------------------------------------------------
# PASS 1 — analyze and collect variants
# --------------------------------------------------
category_redirects = {}
for i, path in enumerate(files, 1):
    try:
        page_html = path.read_text(encoding="utf-8", errors="replace")
        article_id = extract_article_id(page_html)
        if not article_id:
            if not is_error_page(page_html):
                ignored_pages.append(path.name)
                continue
            fake_title = normalize_title(path.stem)
            all_variants[f"error::{fake_title}"].append({
                "path": path,
                "title": fake_title,
                "canonical_key": fake_title,
                "article_id": None,
                "wg_title": None,
                "redirect": False,
                "is_category": False,
                "is_listing_only": False,
                "is_error": True,
            })
            problems.append(f"Error page detected: {path.name}")
            continue
        title = extract_page_identity(page_html)
        if not title:
            problems.append(f"No title: {path}")
            continue
        ns = extract_namespace(page_html)
        # skip some namespaces entirely
        if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
            ignored_pages.append(path.name)
            continue
        title = decode_mediawiki_string(title)
        norm = normalize_title(title)
        page_name = extract_wg_page_name(page_html)
        full_title = normalize_title(page_name) if page_name else norm
        base_title = norm
        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        is_category = ns == "Category" or norm.startswith("category:")
        has_content = has_editorial_content(page_html)
        is_listing_only = is_category and not has_content
        wg_title = extract_wg_title(page_html)
        # Categories
        if ns == "Category":
            m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
            if m_title:
                wg_title = decode_mediawiki_string(m_title.group(1))
                cat_base = normalize_title(wg_title)
                page_norm = normalize_title(page_name) if page_name else None
                if page_norm and page_norm != f"category:{cat_base}":
                    # a real page disguised as a category
                    base_title = page_norm
                    is_category = False
                else:
                    base_title = cat_base
                    is_category = True
            else:
                base_title = norm.replace("category:", "", 1)
                is_category = True
        # internal redirect
        redir = extract_internal_redirect(page_html)
        if redir:
            key = full_title
            target = normalize_title(redir)
            if is_listing_only or is_category:
                category_redirects[key] = target
            else:
                redirects[key] = target
        is_league = is_league_title(full_title)
        base_no_league = strip_league_prefix(full_title)
        canonical_key = normalize_reference_key(full_title)
        all_variants[article_id].append({
            "path": path,
            "title": base_title,
            "canonical_key": canonical_key,
            "article_id": article_id,
            "wg_title": normalize_title(wg_title) if wg_title else None,
            "redirect": is_redirect,
            "is_category": is_category,
            "is_listing_only": is_listing_only,
            "is_error": False,
            "is_league": is_league,
            "base_no_league": base_no_league,
        })
    except Exception as e:
        problems.append(f"{path}: {e}")
    if i % 200 == 0:
        print(f"{i}/{len(files)} analyzed")
print("Variants collected:", len(all_variants))
print("Added category_redirect from category/listing:", len(category_redirects))
# --------------------------------------------------
# PASS 2 — choose the canonical versions
# --------------------------------------------------
canonical_pages = {}
potential_tags = defaultdict(list)
equivalences = {}
category_renamed = 0
category_not_chosen = 0
error_pages = []
def slug_to_title(filename: str) -> str:
    name = Path(filename).stem
    name = re.sub(r"\d+$", "", name)
    return normalize_title(name)


def filename_similarity_score(filename, wg_title):
    if not wg_title:
        return 0
    filename = normalize_title(filename)
    wg_title = normalize_title(wg_title)
    # strip trailing digits
    filename = re.sub(r"\d+$", "", filename)
    return similarity(filename, wg_title)


def variant_score(v):
    filename = v["path"].stem
    filename_norm = normalize_title(filename)
    similarity_score = filename_similarity_score(
        filename_norm,
        v["wg_title"]
    )
    is_short_slug = bool(
        SHORT_SLUG_RE.match(filename_norm.replace(" ", ""))
    )
    long_title_penalty = (
        "," in filename or
        "_" in filename or
        len(filename) > 40
    )
    league_penalty = v.get("is_league", False)
    if v.get("is_error"):
        return (True, True, True, True, True, 0, 9999, "zzz")
    return (
        v["is_listing_only"],
        v["redirect"],
        league_penalty,
        not is_short_slug,
        long_title_penalty,
        -similarity_score,
        len(filename),
        filename.lower(),
    )
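# Sorting note: variant_score() is used as a sort key and the smallest tuple wins.
# The preferred variant is therefore, in order of priority: not a listing-only
# category, not a redirect, not a "league model -" title, a short slug, no
# long-filename penalty, the highest filename/wgTitle similarity, the shortest
# filename, and finally alphabetical order as a tiebreak.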
def add_equivalence(k, v):
    k = normalize_reference_key(k)
    v = normalize_reference_key(v)
    if k != v:
        if v not in [d["title"] for d in canonical_pages.values()]:
            print("⚠️ Adding equivalence to NON-CANONICAL value:", k, "->", v)
        equivalences[k] = v
for article_id, variants in all_variants.items():
    variants_sorted = sorted(variants, key=variant_score)
    chosen = variants_sorted[0]
    canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem))
    # listing-only categories
    if chosen["is_listing_only"]:
        tag_name = normalize_reference_key(chosen["title"])
        for v in variants:
            potential_tags[tag_name].append(normalize_title(v["path"].stem))
            if v["wg_title"]:
                potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
        continue
    if all(v.get("is_error") for v in variants):
        chosen_variant = variants[0]
        canonical_pages[article_id] = {
            "path": chosen_variant["path"],
            "title": normalize_reference_key(chosen_variant["path"].stem),
            "redirect": False,
        }
        error_pages.append(chosen_variant["path"].name)
        continue
    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_slug,
        "redirect": chosen["redirect"],
    }
    if chosen.get("is_error"):
        error_pages.append(chosen["path"].name)
    for v in variants:
        if v["is_category"] and not v["is_listing_only"]:
            # category variant that was not chosen
            if v is not chosen:
                category_not_chosen += 1
            # chosen variant whose file is a category_* page -> renamed
            elif chosen["path"].stem.lower().startswith("category"):
                category_renamed += 1
        if v is not chosen:
            filename_key = normalize_title(Path(v["path"]).stem)
            add_equivalence(filename_key, canonical_slug)
        if v.get("is_league"):
            league_key = normalize_reference_key(v["canonical_key"])
            base_key = normalize_reference_key(v["base_no_league"])
            add_equivalence(league_key, base_key)
print(f"{len(canonical_pages)} pages canoniques")
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
print(f"{category_renamed} pages prefix 'category_*' renommées")
print(f"{len(potential_tags)} potential_tags enregistrés")
print(f"{len(error_pages)} error_pages détectées")
# --------------------------------------------------
# PASS 3 — resolve redirects
# --------------------------------------------------
def resolve_redirect(key):
    seen = set()
    while key in redirects and key not in seen:
        seen.add(key)
        key = redirects[key]
    return key
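# Note: PASS 3 below uses resolve_all(), which follows both redirect and
# equivalence mappings; resolve_redirect() only follows the redirects dict.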
def resolve_all(key):
    seen = set()
    while key not in seen:
        seen.add(key)
        if key in redirects:
            key = redirects[key]
            continue
        if key in equivalences:
            key = equivalences[key]
            continue
        break
    return key
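# The `seen` set guards against redirect/equivalence cycles (A -> B -> A):
# resolution stops as soon as a key comes around a second time.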
skipped_redirect = 0
valid_titles = {
    data["title"]
    for data in canonical_pages.values()
}
for k, v in {**redirects, **category_redirects}.items():
    if k == v:
        continue
    final = resolve_all(v)
    if final in valid_titles and k != final:
        equivalences[k] = final
    else:
        skipped_redirect += 1
print(f"Skipped redirect to non-canonical: {skipped_redirect}")
# --------------------------------------------------
# PASS 4 — final normalization of the equivalences
# --------------------------------------------------
def resolve_equivalence(key):
    seen = set()
    while key in equivalences and key not in seen:
        seen.add(key)
        key = equivalences[key]
    return key


clean_equivalences = {}
for k, v in equivalences.items():
    final = resolve_equivalence(v)
    if final in valid_titles and k != final:
        clean_equivalences[k] = final
    else:
        if final not in valid_titles:
            problems.append(f"Non canonical mapping: {k} -> {final}")
equivalences = clean_equivalences
print(f"Equivalences kept: {len(equivalences)}")
# --------------------------------------------------
# PASS 5 — copy the canonical pages
# --------------------------------------------------
def title_to_filename(title: str) -> str:
    return sanitize_filename(
        title.replace(" ", "_").replace("\u2019", "'").replace("\u2018", "'").replace("\u201c", '"').replace("\u201d", '"').casefold() + ".html"
    )


copied = 0
total = len(canonical_pages)
for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
    src = data["path"]
    dst_name = title_to_filename(data["title"])
    dst = PAGES_DIR / dst_name
    try:
        shutil.copy2(src, dst)
        # replace the dict entry with the destination filename for the registry
        canonical_pages[article_id] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")
    if i % 200 == 0 or i == total:
        print(f"{i}/{total} copied")
print(f"{copied} pages copied")
# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------
registry = {
    "canonical_pages": canonical_pages,
    "equivalences": equivalences,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
    "error_pages": error_pages,
}
REGISTRY_PATH.parent.mkdir(exist_ok=True)
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=2, ensure_ascii=False)
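# Illustrative shape of equivalence_registry.json (hypothetical values):
# {
#   "canonical_pages": {"1234": "some_club.html"},
#   "equivalences": {"old title": "some club"},
#   "potential_tags": {"junior teams": ["junior_teams", "junior teams"]},
#   "ignored_pages": ["template_foo.html"],
#   "error_pages": ["503_page.html"]
# }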
# --------------------------------------------------
# REPORT
# --------------------------------------------------
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
    f.write(f"Canonical pages: {len(canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
    f.write(f"Error pages: {len(error_pages)}\n")
    for p in problems:
        f.write(p + "\n")
print("\n✅ PREPARATION COMPLETE")