From a3f3e61e7d83c0e3ad5ace14f6801b35adafcc17 Mon Sep 17 00:00:00 2001 From: maximator Date: Sat, 4 Apr 2026 01:32:39 +0200 Subject: [PATCH] fix page dedup --- prepare_pages_and_registry.py | 154 +++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 66 deletions(-) diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index 4b0ac16..b5784fb 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -48,12 +48,9 @@ def extract_page_identity(html: str): page = extract_wg_page_name(html) if page: return page - - # fallback title tag m = re.search(r"(.*?) -", html, re.I) if m: return html.unescape(m.group(1)) - return None @@ -66,18 +63,6 @@ def extract_article_id(html: str) -> int | None: return None -def extract_redirect(html: str) -> str | None: - m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I) - if m: - return m.group(1).strip() - return None - - -def namespace_of(title: str): - if ":" in title: - return title.split(":", 1)[0] - - def extract_internal_redirect(page_html: str): m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html) if m: @@ -91,26 +76,24 @@ def extract_namespace(html: str) -> str: return m.group(1) return "" + # -------------------------------------------------- # Registry structures # -------------------------------------------------- -canonical_pages = {} -equivalences = {} -redirects = {} ignored_pages = [] problems = [] +redirects = {} +all_variants = defaultdict(list) files = list(SOURCE_DIR.glob("*.html")) print(f"{len(files)} fichiers trouvés") - # -------------------------------------------------- -# PASS 1 — analyse +# PASS 1 — analyse et collecte des variantes # -------------------------------------------------- for i, path in enumerate(files, 1): - try: page_html = path.read_text(encoding="utf-8", errors="ignore") @@ -126,50 +109,51 @@ for i, path in enumerate(files, 1): ns = extract_namespace(page_html) - # ------------------------- - # Ignore namespaces - # ------------------------- + # Ignorer certains namespaces if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"): ignored_pages.append(path.name) continue title = html.unescape(title) norm = normalize_title(title) + base_title = norm + is_redirect = bool(IS_REDIRECT_RE.search(page_html)) + is_category = ns == "Category" or norm.startswith("category:") - - # ------------------------- - # Category pages - # ------------------------- - # Category pages CAN be canonical content - if ns == "Category": - norm = normalize_title(title) - equivalences[norm] = norm - - # ------------------------- - # Redirect detection - # ------------------------- + # redirect interne redir = extract_internal_redirect(page_html) if redir: redirects[normalize_title(redir)] = norm - # ------------------------- - # Canonical article - # ------------------------- - is_redirect = bool(IS_REDIRECT_RE.search(page_html)) - if article_id not in canonical_pages: - canonical_pages[article_id] = { - "path": path, - "title": norm, - "redirect": is_redirect, - } - elif canonical_pages[article_id]["redirect"] and not is_redirect: - canonical_pages[article_id] = { - "path": path, - "title": norm, - "redirect": is_redirect, - } - # self equivalence - equivalences[norm] = norm + # Categories + if ns == "Category": + m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html) + + if m_title: + wg_title = html.unescape(m_title.group(1)) + cat_base = normalize_title(wg_title) + + page_name = extract_wg_page_name(page_html) + page_norm = normalize_title(page_name) if page_name else None + + if page_norm and page_norm != f"category:{cat_base}": + # page réelle déguisée en category + base_title = page_norm + is_category = False + else: + base_title = cat_base + is_category = True + else: + base_title = norm.replace("category:", "", 1) + is_category = True + + all_variants[article_id].append({ + "path": path, + "title": base_title, + "article_id": article_id, + "redirect": is_redirect, + "is_category": is_category, + }) except Exception as e: problems.append(f"{path}: {e}") @@ -177,9 +161,57 @@ for i, path in enumerate(files, 1): if i % 200 == 0: print(f"{i}/{len(files)} analysés") +# -------------------------------------------------- +# PASS 2 — choix des versions canoniques +# -------------------------------------------------- + +canonical_pages = {} +equivalences = {} +category_replaced = 0 +nb_all_cat = 0 + + +def variant_score(v): + """ + Plus le score est petit → meilleur candidat. + """ + return ( + v["is_category"], # False (0) meilleur que True (1) + v["redirect"], # False meilleur + "category:" in v["path"].name.lower(), # sécurité filename + len(v["path"].name), # stabilité + ) + + +for article_id, variants in all_variants.items(): + + # tri déterministe + variants_sorted = sorted(variants, key=variant_score) + + chosen = variants_sorted[0] + + if all(v["is_category"] for v in variants): + nb_all_cat += 1 + + if chosen["is_category"]: + category_replaced += 1 + + canonical_pages[article_id] = { + "path": chosen["path"], + "title": chosen["title"], + "redirect": chosen["redirect"], + } + + # équivalences + for v in variants: + equivalences[v["title"]] = chosen["title"] + + +print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}") +print(f"{category_replaced} 'category_*' remplacées par leur version de base") # -------------------------------------------------- -# PASS 2 — resolve redirects +# PASS 3 — resolve redirects # -------------------------------------------------- def resolve_redirect(key): @@ -189,24 +221,18 @@ def resolve_redirect(key): key = redirects[key] return key - for k, v in list(redirects.items()): equivalences[k] = resolve_redirect(v) - # -------------------------------------------------- -# PASS 3 — copy canonical pages +# PASS 4 — copie des pages canoniques # -------------------------------------------------- copied = 0 - -# for key, src in canonical_pages.items(): for key, data in canonical_pages.items(): - src = data["path"] dst_name = sanitize_filename(src.name) dst = PAGES_DIR / dst_name - try: shutil.copy2(src, dst) canonical_pages[key] = dst_name @@ -216,7 +242,6 @@ for key, data in canonical_pages.items(): print(f"{copied} pages copiées") - # -------------------------------------------------- # SAVE REGISTRY # -------------------------------------------------- @@ -229,11 +254,9 @@ registry = { } REGISTRY_PATH.parent.mkdir(exist_ok=True) - with open(REGISTRY_PATH, "w", encoding="utf-8") as f: json.dump(registry, f, indent=2, ensure_ascii=False) - # -------------------------------------------------- # REPORT # -------------------------------------------------- @@ -245,7 +268,6 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write(f"Redirects: {len(redirects)}\n") f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Problems: {len(problems)}\n\n") - for p in problems[:200]: f.write(p + "\n")