fix mapping canonical preference

Maxime Réaux 2026-04-07 15:06:30 +02:00
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions

View file

@@ -5,6 +5,7 @@ import shutil
 import html
 from pathlib import Path
 from collections import defaultdict
+from difflib import SequenceMatcher

 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -24,6 +25,11 @@ INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
 IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
 NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
+WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
+SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
+
+def similarity(a, b):
+    return SequenceMatcher(None, a, b).ratio()

 def normalize_title(title: str) -> str:
     title = title.strip()
@@ -77,6 +83,25 @@ def extract_namespace(html: str) -> str:
     return ""

+def extract_wg_title(page_html):
+    m = WG_TITLE_RE.search(page_html)
+    if m:
+        return html.unescape(m.group(1))
+    return None
+
+def normalize_reference_key(key: str) -> str:
+    key = normalize_title(key)
+    # strip the category namespace prefix
+    key = re.sub(r"^category[\s:_]+", "", key)
+    # collapse whitespace
+    key = re.sub(r"\s+", " ", key)
+    return key.strip()

 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -121,6 +146,7 @@ for i, path in enumerate(files, 1):
     base_title = norm
     is_redirect = bool(IS_REDIRECT_RE.search(page_html))
     is_category = ns == "Category" or norm.startswith("category:")
+    wg_title = extract_wg_title(page_html)

     # Categories
     if ns == "Category":
@@ -148,11 +174,13 @@ for i, path in enumerate(files, 1):
     if redir:
         redirects[full_title] = normalize_title(redir)

+    canonical_key = normalize_reference_key(full_title)
     all_variants[article_id].append({
         "path": path,
         "title": base_title,
-        "full_title": full_title,
+        "canonical_key": canonical_key,  # the normalized key, not the raw full_title
         "article_id": article_id,
+        "wg_title": normalize_title(wg_title) if wg_title else None,
         "redirect": is_redirect,
         "is_category": is_category,
     })
@@ -162,6 +190,7 @@ for i, path in enumerate(files, 1):
     if i % 200 == 0:
         print(f"{i}/{len(files)} analyzed")

+print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 # PASS 2: choose the canonical versions
 # --------------------------------------------------
@@ -173,15 +202,52 @@ category_replaced = 0
 nb_all_cat = 0

+def slug_to_title(filename: str) -> str:
+    name = Path(filename).stem
+    name = re.sub(r"\d+$", "", name)
+    return normalize_title(name)
+
+def filename_similarity_score(filename, wg_title):
+    if not wg_title:
+        return 0
+    filename = normalize_title(filename)
+    wg_title = normalize_title(wg_title)
+    # strip trailing digit suffixes
+    filename = re.sub(r"\d+$", "", filename)
+    return similarity(filename, wg_title)
 def variant_score(v):
-    """
-    The smaller the score, the better the candidate.
-    """
+    filename = v["path"].stem
+    filename_norm = normalize_title(filename)
+
+    similarity_score = filename_similarity_score(
+        filename_norm,
+        v["wg_title"]
+    )
+
+    is_short_slug = bool(
+        SHORT_SLUG_RE.match(filename_norm.replace(" ", ""))
+    )
+
+    long_title_penalty = (
+        "," in filename or
+        "_" in filename or
+        len(filename) > 40
+    )
+
     return (
-        v["is_category"],   # False (0) better than True (1)
-        v["redirect"],      # False is better
-        "category:" in v["path"].name.lower(),  # filename safety check
-        len(v["path"].name),                    # stability
+        v["is_category"],
+        v["redirect"],
+        not is_short_slug,
+        long_title_penalty,
+        -similarity_score,
+        len(filename),
+        filename.lower(),
     )
@@ -189,6 +255,7 @@ for article_id, variants in all_variants.items():
     # deterministic sort
     variants_sorted = sorted(variants, key=variant_score)
+    print(f"variants_sorted: {variants_sorted}")

     chosen = variants_sorted[0]
@@ -198,16 +265,34 @@ for article_id, variants in all_variants.items():
     if chosen["is_category"]:
         category_replaced += 1

+    canonical_title = normalize_reference_key(chosen["title"])
     canonical_pages[article_id] = {
         "path": chosen["path"],
-        "title": chosen["title"],
+        "title": canonical_title,
         "redirect": chosen["redirect"],
     }

     # equivalences
     for v in variants:
-        equivalences[v["full_title"]] = chosen["title"]
+        equivalences[v["canonical_key"]] = chosen["title"]

+equivalences.clear()  # rebuilt below, keyed on canonical slugs
+
+def add_equivalence(k, v):
+    k = normalize_reference_key(k)
+    v = normalize_reference_key(v)
+    if k != v:
+        equivalences[k] = v
+
+for article_id, variants in all_variants.items():
+    canonical_title = canonical_pages[article_id]["title"]
+    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
+    for v in variants:
+        add_equivalence(v["canonical_key"], canonical_slug)
+        filename_key = normalize_title(Path(v["path"]).stem)
+        add_equivalence(filename_key, canonical_slug)

 print(f"Number of cases where all variants are categories: {nb_all_cat}")
 print(f"{category_replaced} 'category_*' pages replaced by their base version")
@@ -231,7 +316,7 @@ for src, dst in list(redirects.items()):
         equivalences[src] = final
 redirects.clear()

 # --------------------------------------------------
 # PASS 4: final normalization of the equivalences
 # --------------------------------------------------
@@ -243,21 +328,41 @@ valid_titles = {
 for k, v in list(equivalences.items()):
     if v not in valid_titles:
         equivalences[k] = equivalences.get(v, v)

+# category:* or category_* used as keys
+for k, v in list(equivalences.items()):
+    new_k = re.sub(r"^category[\s:_]+", "category ", k)
+    if new_k != k:
+        equivalences[new_k] = v
+        del equivalences[k]

 # invariant registry
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")

+# drop identity mappings
+equivalences = {
+    k: v for k, v in equivalences.items()
+    if k != v
+}

 # --------------------------------------------------
 # PASS 5: copy the canonical pages
 # --------------------------------------------------
+def title_to_filename(title: str) -> str:
+    return sanitize_filename(
+        title.replace(" ", "_").casefold() + ".html"
+    )

 copied = 0
-for key, data in canonical_pages.items():
+total = len(canonical_pages)
+for i, (key, data) in enumerate(canonical_pages.items(), 1):
     src = data["path"]
-    dst_name = sanitize_filename(src.name)
+    dst_name = sanitize_filename(src.name.casefold())
     dst = PAGES_DIR / dst_name
     try:
         shutil.copy2(src, dst)
         canonical_pages[key] = dst_name
@@ -265,6 +370,9 @@ for key, data in canonical_pages.items():
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")

+    if i % 200 == 0 or i == total:
+        print(f"{i}/{total} copied")

 print(f"{copied} pages copied")

 # --------------------------------------------------
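
To make the new preference order concrete, here is a minimal, self-contained sketch of how variant_score is meant to rank duplicate files for one article ID. The variant dicts and file names are hypothetical, and normalize_title is a simplified stand-in for the script's own helper (assumed to lowercase, trim, and turn underscores into spaces):

from pathlib import Path
from difflib import SequenceMatcher
import re

SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")

def normalize_title(t):
    # simplified stand-in for the script's helper
    return re.sub(r"\s+", " ", t.replace("_", " ").strip()).lower()

def variant_score(v):
    filename = v["path"].stem
    filename_norm = normalize_title(filename)
    # similarity between the digit-stripped filename and wgTitle
    sim = 0
    if v["wg_title"]:
        stripped = re.sub(r"\d+$", "", filename_norm)
        sim = SequenceMatcher(None, stripped, v["wg_title"]).ratio()
    is_short_slug = bool(SHORT_SLUG_RE.match(filename_norm.replace(" ", "")))
    long_title_penalty = ("," in filename or "_" in filename
                          or len(filename) > 40)
    return (v["is_category"], v["redirect"], not is_short_slug,
            long_title_penalty, -sim, len(filename), filename.lower())

variants = [  # hypothetical duplicates sharing one wgArticleId
    {"path": Path("category_anvil2.html"), "wg_title": "anvil",
     "is_category": True, "redirect": False},
    {"path": Path("anvil,_the_blacksmith_tool.html"), "wg_title": "anvil",
     "is_category": False, "redirect": False},
    {"path": Path("anvil.html"), "wg_title": "anvil",
     "is_category": False, "redirect": False},
]
print(min(variants, key=variant_score)["path"].name)  # -> anvil.html

Under this ordering, short plain slugs that match wgTitle win; category dumps and redirects are pushed back first, then comma- or underscore-laden and overlong names, with similarity, length, and the lowercased name as final tie-breakers.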

View file

@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote

-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
-
-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
-
-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
-
-# ======================
-# HELPERS
-# ======================
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
+# HELPERS
+# --------------------------------------------------
+def normalize_title(title: str | None):
+    if not title:
+        return None
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
-def normalize_href(href: str):
+def extract_mediawiki_target(href: str):
     if not href:
         return None

-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None

-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback: filename-like
+    return Path(path).stem
-def resolve(name):
-    if name in title_registry:
-        return name
-    if name in alias_registry:
-        return alias_registry[name]
-
-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
-    return None
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
+
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)
+
+# -------------------------
+# Extract article links
+# -------------------------
 def extract_article_links(soup):
@@ -52,33 +104,26 @@ def extract_article_links(soup):
     links = []
-    for a in content.find_all("a", href=True):
-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+    for a in content.select("a[href]"):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue
+        href = a.get("href")
         links.append(href)
     return links
-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------
-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages to scan")

 for i, file_path in enumerate(files, 1):
@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):
     for href in links:
-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+        if not norm:
             continue
-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue

         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }

+        resolved = equivalences.get(norm)
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)

     if i % 100 == 0:
         print(f"{i}/{len(files)} scanned")
-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------
-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)

 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))