fix bad redirects in equivalences

This commit is contained in:
Maxime Réaux 2026-04-10 10:29:21 +02:00
parent 556d6f1e03
commit 022a17221d

View file

@ -7,6 +7,7 @@ from pathlib import Path
from collections import defaultdict from collections import defaultdict
from difflib import SequenceMatcher from difflib import SequenceMatcher
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import unicodedata
SOURCE_DIR = Path("../original_index") SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output") OUTPUT_DIR = Path("../output")
@ -29,12 +30,32 @@ NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"') WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$") SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
# Matches one JSON-style \uXXXX escape sequence embedded in page HTML/JS config.
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')

def decode_mediawiki_string(s: str) -> str:
    """Decode a raw string scraped from MediaWiki page HTML / JS config.

    Applies, in order:
      1. HTML entity unescaping (``&amp;``, ``&#233;``, ...).
      2. JSON-style ``\\uXXXX`` escape decoding.

    Non-BMP characters (emoji, rare CJK) appear in the config as
    surrogate *pairs* (``\\ud83d\\ude00``); decoding each escape with
    ``chr()`` alone would leave two unpaired surrogates, so the pairs
    are recombined afterwards via a UTF-16 round-trip.

    Empty input (``""`` or falsy) is returned unchanged.
    """
    if not s:
        return s
    # 1 — HTML entities
    s = html.unescape(s)

    # 2 — decode ONLY \uXXXX sequences (safe: leaves other backslashes alone)
    def repl(m):
        return chr(int(m.group(1), 16))

    s = UNICODE_ESCAPE_RE.sub(repl, s)
    # Recombine surrogate pairs produced by the escape decoding; if a lone
    # surrogate has no partner, keep the string as-is rather than crash.
    try:
        s = s.encode("utf-16", "surrogatepass").decode("utf-16")
    except UnicodeDecodeError:
        pass
    return s
def similarity(a, b): def similarity(a, b):
return SequenceMatcher(None, a, b).ratio() return SequenceMatcher(None, a, b).ratio()
def normalize_title(title: str) -> str: def normalize_title(title: str) -> str:
title = title.strip() title = title.strip()
title = unicodedata.normalize("NFKC", title)
title = title.replace("_", " ") title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
title = re.sub(r"\s+", " ", title) title = re.sub(r"\s+", " ", title)
return title.casefold() return title.casefold()
@ -47,7 +68,7 @@ def sanitize_filename(name: str) -> str:
def extract_wg_page_name(page_html: str) -> str | None: def extract_wg_page_name(page_html: str) -> str | None:
m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html) m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
if m: if m:
return html.unescape(m.group(1)).replace("_", " ") return decode_mediawiki_string(m.group(1)).replace("_", " ")
return None return None
@ -57,7 +78,7 @@ def extract_page_identity(page_html: str):
return page return page
m = re.search(r"<title>(.*?) -", page_html, re.I) m = re.search(r"<title>(.*?) -", page_html, re.I)
if m: if m:
return html.unescape(m.group(1)) return decode_mediawiki_string(m.group(1))
return None return None
@ -73,7 +94,7 @@ def extract_article_id(page_html: str) -> int | None:
def extract_internal_redirect(page_html: str): def extract_internal_redirect(page_html: str):
m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html) m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
if m: if m:
return html.unescape(m.group(1)).replace("_", " ") return decode_mediawiki_string(m.group(1)).replace("_", " ")
return None return None
@ -87,7 +108,7 @@ def extract_namespace(page_html: str) -> str:
def extract_wg_title(page_html): def extract_wg_title(page_html):
m = WG_TITLE_RE.search(page_html) m = WG_TITLE_RE.search(page_html)
if m: if m:
return html.unescape(m.group(1)) return decode_mediawiki_string(m.group(1))
return None return None
@ -97,14 +118,16 @@ def normalize_reference_key(key: str) -> str:
# normalise namespace category # normalise namespace category
key = re.sub(r"^category[\s:_]+", "", key) key = re.sub(r"^category[\s:_]+", "", key)
# normalise les apostrophes typographiques → ascii
    key = key.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
# collapse espaces # collapse espaces
key = re.sub(r"\s+", " ", key) key = re.sub(r"\s+", " ", key)
return key.strip() return key.strip()
def has_editorial_content(html_page: str) -> bool: def has_editorial_content(html_page: str) -> bool:
soup = BeautifulSoup(html_page, "html.parser") soup = BeautifulSoup(html_page, "html.parser")
@ -129,6 +152,7 @@ def has_editorial_content(html_page: str) -> bool:
return len(editorial_text) > 200 return len(editorial_text) > 200
# -------------------------------------------------- # --------------------------------------------------
# Registry structures # Registry structures
# -------------------------------------------------- # --------------------------------------------------
@ -145,9 +169,11 @@ print(f"{len(files)} fichiers trouvés")
# PASS 1 — analyse et collecte des variantes # PASS 1 — analyse et collecte des variantes
# -------------------------------------------------- # --------------------------------------------------
category_redirects = {}
for i, path in enumerate(files, 1): for i, path in enumerate(files, 1):
try: try:
page_html = path.read_text(encoding="utf-8", errors="ignore") page_html = path.read_text(encoding="utf-8", errors="replace")
article_id = extract_article_id(page_html) article_id = extract_article_id(page_html)
if not article_id: if not article_id:
@ -166,7 +192,7 @@ for i, path in enumerate(files, 1):
ignored_pages.append(path.name) ignored_pages.append(path.name)
continue continue
title = html.unescape(title) title = decode_mediawiki_string(title)
norm = normalize_title(title) norm = normalize_title(title)
page_name = extract_wg_page_name(page_html) page_name = extract_wg_page_name(page_html)
full_title = normalize_title(page_name) if page_name else norm full_title = normalize_title(page_name) if page_name else norm
@ -182,7 +208,7 @@ for i, path in enumerate(files, 1):
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html) m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
if m_title: if m_title:
wg_title = html.unescape(m_title.group(1)) wg_title = decode_mediawiki_string(m_title.group(1))
cat_base = normalize_title(wg_title) cat_base = normalize_title(wg_title)
page_norm = normalize_title(page_name) if page_name else None page_norm = normalize_title(page_name) if page_name else None
@ -201,7 +227,12 @@ for i, path in enumerate(files, 1):
# redirect interne # redirect interne
redir = extract_internal_redirect(page_html) redir = extract_internal_redirect(page_html)
if redir: if redir:
redirects[full_title] = normalize_title(redir) key = full_title
target = normalize_title(redir)
if is_listing_only or is_category:
category_redirects[key] = target
else:
redirects[key] = target
canonical_key = normalize_reference_key(full_title) canonical_key = normalize_reference_key(full_title)
all_variants[article_id].append({ all_variants[article_id].append({
@ -221,6 +252,7 @@ for i, path in enumerate(files, 1):
if i % 200 == 0: if i % 200 == 0:
print(f"{i}/{len(files)} analysés") print(f"{i}/{len(files)} analysés")
print("Variants collected:", len(all_variants)) print("Variants collected:", len(all_variants))
print("Added category_redirect from category/listing:", len(category_redirects))
# -------------------------------------------------- # --------------------------------------------------
# PASS 2 — choix des versions canoniques # PASS 2 — choix des versions canoniques
@ -284,8 +316,9 @@ def variant_score(v):
def add_equivalence(k, v): def add_equivalence(k, v):
k = normalize_reference_key(k) k = normalize_reference_key(k)
v = normalize_reference_key(v) v = normalize_reference_key(v)
if k != v: if k != v:
if v not in [d["title"] for d in canonical_pages.values()]:
print("⚠️ Adding equivalence to NON-CANONICAL value:", k, "->", v)
equivalences[k] = v equivalences[k] = v
@ -310,9 +343,6 @@ for article_id, variants in all_variants.items():
"redirect": chosen["redirect"], "redirect": chosen["redirect"],
} }
if chosen["wg_title"]:
add_equivalence(chosen["wg_title"], canonical_slug)
for v in variants: for v in variants:
if v["is_category"] and not v["is_listing_only"]: if v["is_category"] and not v["is_listing_only"]:
# catégorie non choisie # catégorie non choisie
@ -342,14 +372,59 @@ def resolve_redirect(key):
key = redirects[key] key = redirects[key]
return key return key
def resolve_all(key):
    """Follow redirect and equivalence mappings to a fixed point.

    At each step the `redirects` table takes precedence over
    `equivalences`. Resolution stops when neither table maps the
    current key, or when a key repeats (cycle guard), and the last
    key reached is returned.
    """
    visited = set()
    current = key
    while current not in visited:
        visited.add(current)
        if current in redirects:
            current = redirects[current]
        elif current in equivalences:
            current = equivalences[current]
        else:
            return current
    return current
skipped_redirect = 0
ignored_redirect = 0
valid_titles = {
data["title"]
for data in canonical_pages.values()
}
for k, v in category_redirects.items():
if k == v:
continue
final = resolve_all(v)
if final in valid_titles and k != final:
equivalences[k] = final
for k, v in list(redirects.items()): for k, v in list(redirects.items()):
equivalences[k] = resolve_redirect(v) if k == v:
continue
final = resolve_all(v)
if final in valid_titles and k != final:
equivalences[k] = final
else:
skipped_redirect += 1
for src, dst in list(redirects.items()): for src, dst in list(redirects.items()):
final = equivalences.get(dst, dst) final = equivalences.get(dst, dst)
if final in valid_titles and src != final:
equivalences[src] = final equivalences[src] = final
else:
ignored_redirect += 1
print(f"Skipped redirect to non-canonical: {skipped_redirect}")
print(f"Ignored redirect (non-canonical): {ignored_redirect}")
redirects.clear()
# -------------------------------------------------- # --------------------------------------------------
# PASS 4 — normalisation finale des equivalences # PASS 4 — normalisation finale des equivalences
# -------------------------------------------------- # --------------------------------------------------
@ -361,24 +436,13 @@ def resolve_equivalence(key):
key = equivalences[key] key = equivalences[key]
return key return key
for k in list(equivalences): for k in list(equivalences):
equivalences[k] = resolve_equivalence(equivalences[k]) final = resolve_equivalence(equivalences[k])
if final in valid_titles:
equivalences[k] = final
valid_titles = {
data["title"]
for data in canonical_pages.values()
}
for k, v in list(equivalences.items()):
if v not in valid_titles:
equivalences[k] = equivalences.get(v, v)
# category:* ou category_* comme clés
for k, v in list(equivalences.items()):
new_k = re.sub(r"^category[\s:_]+", "category ", k)
if new_k != k:
equivalences[new_k] = v
del equivalences[k]
# invariant registry
for k, v in equivalences.items(): for k, v in equivalences.items():
if v not in valid_titles: if v not in valid_titles:
problems.append(f"Non canonical mapping: {k} -> {v}") problems.append(f"Non canonical mapping: {k} -> {v}")
@ -387,17 +451,16 @@ equivalences = {
k: v for k, v in equivalences.items() k: v for k, v in equivalences.items()
if k != v if k != v
} }
print(f"Equivalences kept: {len(equivalences)}")
for k in list(equivalences):
equivalences[k] = resolve_equivalence(equivalences[k])
# -------------------------------------------------- # --------------------------------------------------
# PASS 5 — copie des pages canoniques # PASS 5 — copie des pages canoniques
# -------------------------------------------------- # --------------------------------------------------
def title_to_filename(title: str) -> str: def title_to_filename(title: str) -> str:
return sanitize_filename( return sanitize_filename(
        title.replace(" ", "_").casefold() + ".html" title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
) )