From 022a17221d8a949012d47466e47cc24b5959b0e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?=
Date: Fri, 10 Apr 2026 10:29:21 +0200
Subject: [PATCH] fix bad redirects in equivalences

---
 prepare_pages_and_registry.py | 135 +++++++++++++++++++++++++---------
 1 file changed, 99 insertions(+), 36 deletions(-)
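
Note for reviewers (below the cut, not part of the commit message):
decode_mediawiki_string deliberately rewrites only literal \uXXXX
sequences instead of round-tripping through the unicode_escape codec.
That codec implicitly encodes its input as latin-1 first, so it crashes
or mangles titles that already contain real non-latin-1 characters.
A minimal sketch of the failure mode it avoids, with a made-up title:

    import codecs, re

    raw = "Mus\\u00e9e d’Orsay"   # literal \u00e9 escape + a real curly quote
    # codecs.decode(raw, "unicode_escape")  -> UnicodeEncodeError on "’"
    fixed = re.sub(r"\\u([0-9a-fA-F]{4})",
                   lambda m: chr(int(m.group(1), 16)), raw)
    # fixed == "Musée d’Orsay"; the curly quote survives untouched
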
diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index ee391ba..8da8f36 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from collections import defaultdict
 from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
+import unicodedata
 
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -29,12 +30,32 @@ NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
 WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
 SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
 
+UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
+
+def decode_mediawiki_string(s: str) -> str:
+    if not s:
+        return s
+
+    # 1 — HTML entities
+    s = html.unescape(s)
+
+    # 2 — decode ONLY \uXXXX sequences (safe)
+    def repl(m):
+        return chr(int(m.group(1), 16))
+
+    s = UNICODE_ESCAPE_RE.sub(repl, s)
+
+    return s
+
+
 def similarity(a, b):
     return SequenceMatcher(None, a, b).ratio()
 
 def normalize_title(title: str) -> str:
     title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
     title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
@@ -47,7 +68,7 @@ def sanitize_filename(name: str) -> str:
 def extract_wg_page_name(page_html: str) -> str | None:
     m = re.search(r'"wgPageName"\s*:\s*"([^"]+)"', page_html)
     if m:
-        return html.unescape(m.group(1)).replace("_", " ")
+        return decode_mediawiki_string(m.group(1)).replace("_", " ")
     return None
 
 
@@ -57,7 +78,7 @@ def extract_page_identity(page_html: str):
         return page
     m = re.search(r"<title>(.*?) -", page_html, re.I)
     if m:
-        return html.unescape(m.group(1))
+        return decode_mediawiki_string(m.group(1))
     return None
 
 
@@ -73,7 +94,7 @@ def extract_article_id(page_html: str) -> int | None:
 def extract_internal_redirect(page_html: str):
     m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
     if m:
-        return html.unescape(m.group(1)).replace("_", " ")
+        return decode_mediawiki_string(m.group(1)).replace("_", " ")
     return None
 
 
@@ -87,7 +108,7 @@ def extract_namespace(page_html: str) -> str:
 def extract_wg_title(page_html):
     m = WG_TITLE_RE.search(page_html)
     if m:
-        return html.unescape(m.group(1))
+        return decode_mediawiki_string(m.group(1))
     return None
 
 
@@ -97,14 +118,16 @@ def normalize_reference_key(key: str) -> str:
 
     # normalise the category namespace
     key = re.sub(r"^category[\s:_]+", "", key)
 
+    # normalise typographic quotes to ASCII
+    key = key.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+
+    # collapse whitespace
     key = re.sub(r"\s+", " ", key)
 
     return key.strip()
 
-
-
 def has_editorial_content(html_page: str) -> bool:
     soup = BeautifulSoup(html_page, "html.parser")
 
@@ -129,6 +152,7 @@ def has_editorial_content(html_page: str) -> bool:
 
     return len(editorial_text) > 200
 
+
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -145,9 +169,11 @@ print(f"{len(files)} files found")
 
 # --------------------------------------------------
 # PASS 1 — analyse and collect variants
 # --------------------------------------------------
 
+category_redirects = {}
+
 for i, path in enumerate(files, 1):
     try:
-        page_html = path.read_text(encoding="utf-8", errors="ignore")
+        page_html = path.read_text(encoding="utf-8", errors="replace")
 
         article_id = extract_article_id(page_html)
         if not article_id:
@@ -166,7 +192,7 @@ for i, path in enumerate(files, 1):
             ignored_pages.append(path.name)
             continue
 
-        title = html.unescape(title)
+        title = decode_mediawiki_string(title)
         norm = normalize_title(title)
         page_name = extract_wg_page_name(page_html)
         full_title = normalize_title(page_name) if page_name else norm
@@ -182,7 +208,7 @@ for i, path in enumerate(files, 1):
 
         m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
         if m_title:
-            wg_title = html.unescape(m_title.group(1))
+            wg_title = decode_mediawiki_string(m_title.group(1))
             cat_base = normalize_title(wg_title)
 
         page_norm = normalize_title(page_name) if page_name else None
@@ -201,7 +227,12 @@ for i, path in enumerate(files, 1):
         # internal redirect
         redir = extract_internal_redirect(page_html)
         if redir:
-            redirects[full_title] = normalize_title(redir)
+            key = full_title
+            target = normalize_title(redir)
+            if is_listing_only or is_category:
+                category_redirects[key] = target
+            else:
+                redirects[key] = target
 
         canonical_key = normalize_reference_key(full_title)
         all_variants[article_id].append({
@@ -221,6 +252,7 @@ for i, path in enumerate(files, 1):
     if i % 200 == 0:
         print(f"{i}/{len(files)} analysed")
 
 print("Variants collected:", len(all_variants))
+print("Category/listing redirects collected:", len(category_redirects))
 
 # --------------------------------------------------
 # PASS 2 — choose the canonical versions
 # --------------------------------------------------
@@ -284,8 +316,9 @@ def variant_score(v):
 
 def add_equivalence(k, v):
     k = normalize_reference_key(k)
     v = normalize_reference_key(v)
-    if k != v:
-        equivalences[k] = v
+    if v not in [d["title"] for d in canonical_pages.values()]:
+        print("⚠️ Adding equivalence to NON-CANONICAL value:", k, "->", v)
+    equivalences[k] = v
 
 
@@ -310,9 +343,6 @@ for article_id, variants in all_variants.items():
         "redirect": chosen["redirect"],
     }
 
-    if chosen["wg_title"]:
-        add_equivalence(chosen["wg_title"], canonical_slug)
-
     for v in variants:
         if v["is_category"] and not v["is_listing_only"]:
             # category not chosen as canonical
@@ -342,14 +372,62 @@ def resolve_redirect(key):
         key = redirects[key]
     return key
 
+
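+# Follow a key through `redirects`, then `equivalences`, until it stops
+# changing; the `seen` set cuts redirect cycles (a -> b -> a) so the
+# loop always terminates.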
+def resolve_all(key):
+    seen = set()
+    while key not in seen:
+        seen.add(key)
+
+        if key in redirects:
+            key = redirects[key]
+            continue
+
+        if key in equivalences:
+            key = equivalences[key]
+            continue
+
+        break
+
+    return key
+
+
+skipped_redirect = 0
+ignored_redirect = 0
+
+valid_titles = {
+    data["title"]
+    for data in canonical_pages.values()
+}
+
+for k, v in category_redirects.items():
+    if k == v:
+        continue
+    final = resolve_all(v)
+    if final in valid_titles and k != final:
+        equivalences[k] = final
+
 for k, v in list(redirects.items()):
-    equivalences[k] = resolve_redirect(v)
+    if k == v:
+        continue
+    final = resolve_all(v)
+    if final in valid_titles and k != final:
+        equivalences[k] = final
+    else:
+        skipped_redirect += 1
 
 for src, dst in list(redirects.items()):
     final = equivalences.get(dst, dst)
-    equivalences[src] = final
+    if final in valid_titles and src != final:
+        equivalences[src] = final
+    else:
+        ignored_redirect += 1
+
+print(f"Redirects skipped (target not canonical): {skipped_redirect}")
+print(f"Redirect re-mappings ignored (target not canonical): {ignored_redirect}")
 
-redirects.clear()
 
 # --------------------------------------------------
 # PASS 4 — final normalisation of the equivalences
 # --------------------------------------------------
@@ -361,24 +436,13 @@ def resolve_equivalence(key):
         key = equivalences[key]
     return key
 
+
 for k in list(equivalences):
-    equivalences[k] = resolve_equivalence(equivalences[k])
+    final = resolve_equivalence(equivalences[k])
+    if final in valid_titles:
+        equivalences[k] = final
 
-valid_titles = {
-    data["title"]
-    for data in canonical_pages.values()
-}
-for k, v in list(equivalences.items()):
-    if v not in valid_titles:
-        equivalences[k] = equivalences.get(v, v)
-# category:* or category_* as keys
-for k, v in list(equivalences.items()):
-    new_k = re.sub(r"^category[\s:_]+", "category ", k)
-    if new_k != k:
-        equivalences[new_k] = v
-        del equivalences[k]
-# invariant registry
 
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")
@@ -387,17 +451,16 @@
 equivalences = {
     k: v
     for k, v in equivalences.items()
     if k != v
 }
 
+print(f"Equivalences kept: {len(equivalences)}")
+
 
-for k in list(equivalences):
-    equivalences[k] = resolve_equivalence(equivalences[k])
 
 # --------------------------------------------------
 # PASS 5 — copy the canonical pages
 # --------------------------------------------------
 
-
 def title_to_filename(title: str) -> str:
     return sanitize_filename(
-        title.replace(" ", "_").casefold() + ".html"
+        title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
     )
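
P.S. A toy check of resolve_all's cycle handling (keys are hypothetical;
resolve_all reads the module-level redirects/equivalences maps):

    redirects = {"a": "b", "c": "a"}      # chain a -> b -> c -> a (cycle)
    equivalences = {"b": "c"}
    print(resolve_all("a"))               # "a": the cycle is cut, no hang
    del redirects["c"]
    print(resolve_all("a"))               # "c": redirect, then equivalence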