From 556d6f1e035ee5b927c4a9c0c295db9a5c060a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20R=C3=A9aux?= Date: Thu, 9 Apr 2026 12:05:15 +0200 Subject: [PATCH] capture potential_tags --- prepare_pages_and_registry.py | 156 ++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index 1b3ea96..ee391ba 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -6,6 +6,7 @@ import html from pathlib import Path from collections import defaultdict from difflib import SequenceMatcher +from bs4 import BeautifulSoup SOURCE_DIR = Path("../original_index") OUTPUT_DIR = Path("../output") @@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None: return None -def extract_page_identity(html: str): - page = extract_wg_page_name(html) +def extract_page_identity(page_html: str): + page = extract_wg_page_name(page_html) if page: return page - m = re.search(r"(.*?) -", html, re.I) + m = re.search(r"<title>(.*?) -", page_html, re.I) if m: return html.unescape(m.group(1)) return None -def extract_article_id(html: str) -> int | None: - m = ARTICLE_ID_RE.search(html) +def extract_article_id(page_html: str) -> int | None: + m = ARTICLE_ID_RE.search(page_html) if m: aid = int(m.group(1)) if aid > 0: @@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str): return None -def extract_namespace(html: str) -> str: - m = NAMESPACE_RE.search(html) +def extract_namespace(page_html: str) -> str: + m = NAMESPACE_RE.search(page_html) if m: return m.group(1) return "" @@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str: return key.strip() + + +def has_editorial_content(html_page: str) -> bool: + soup = BeautifulSoup(html_page, "html.parser") + + content = soup.find(id="mw-content-text") + if not content: + return False + + auto = content.select_one(".mw-category-generated") + if not auto: + return True # pas une catégorie auto + + # texte AVANT le listing + editorial_text = "" + + for child in content.children: + if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []): + break + editorial_text += child.get_text(" ", strip=True) + + editorial_text = editorial_text.strip() + + return len(editorial_text) > 200 + + # -------------------------------------------------- # Registry structures # -------------------------------------------------- @@ -146,6 +173,8 @@ for i, path in enumerate(files, 1): base_title = norm is_redirect = bool(IS_REDIRECT_RE.search(page_html)) is_category = ns == "Category" or norm.startswith("category:") + has_content = has_editorial_content(page_html) + is_listing_only = is_category and not has_content wg_title = extract_wg_title(page_html) # Categories @@ -178,11 +207,12 @@ for i, path in enumerate(files, 1): all_variants[article_id].append({ "path": path, "title": base_title, - "canonical_key": full_title, + "canonical_key": canonical_key, "article_id": article_id, "wg_title": normalize_title(wg_title) if wg_title else None, "redirect": is_redirect, "is_category": is_category, + "is_listing_only": is_listing_only, }) except Exception as e: @@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants)) # -------------------------------------------------- canonical_pages = {} +potential_tags = defaultdict(list) equivalences = {} -category_replaced = 0 -nb_all_cat = 0 +category_renamed = 0 +category_not_chosen = 0 def slug_to_title(filename: str) -> str: @@ -241,7 +272,7 @@ def variant_score(v): ) return ( - v["is_category"], + v["is_listing_only"], v["redirect"], not is_short_slug, long_title_penalty, @@ -250,35 +281,6 @@ def variant_score(v): filename.lower(), ) - -for article_id, variants in all_variants.items(): - - # tri déterministe - variants_sorted = sorted(variants, key=variant_score) - print(f"variants_sorted: {variants_sorted}") - - chosen = variants_sorted[0] - - if all(v["is_category"] for v in variants): - nb_all_cat += 1 - - if chosen["is_category"]: - category_replaced += 1 - - canonical_title = normalize_reference_key(chosen["title"]) - - canonical_pages[article_id] = { - "path": chosen["path"], - "title": canonical_title, - "redirect": chosen["redirect"], - } - - # équivalences - for v in variants: - equivalences[v["canonical_key"]] = chosen["title"] - -equivalences.clear() - def add_equivalence(k, v): k = normalize_reference_key(k) v = normalize_reference_key(v) @@ -286,16 +288,48 @@ def add_equivalence(k, v): if k != v: equivalences[k] = v -for article_id, variants in all_variants.items(): - canonical_title = canonical_pages[article_id]["title"] - canonical_slug = Path(canonical_pages[article_id]["path"]).stem - for v in variants: - add_equivalence(v["canonical_key"], canonical_slug) - filename_key = normalize_title(Path(v["path"]).stem) - add_equivalence(filename_key, canonical_slug) -print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}") -print(f"{category_replaced} 'category_*' remplacées par leur version de base") +for article_id, variants in all_variants.items(): + variants_sorted = sorted(variants, key=variant_score) + chosen = variants_sorted[0] + + canonical_slug = normalize_reference_key(chosen["path"].stem) + + # categories listing-only + if chosen["is_listing_only"]: + tag_name = normalize_reference_key(chosen["title"]) + for v in variants: + potential_tags[tag_name].append(normalize_title(v["path"].stem)) + if v["wg_title"]: + potential_tags[tag_name].append(normalize_reference_key(v["wg_title"])) + continue + + canonical_pages[article_id] = { + "path": chosen["path"], + "title": canonical_slug, + "redirect": chosen["redirect"], + } + + if chosen["wg_title"]: + add_equivalence(chosen["wg_title"], canonical_slug) + + for v in variants: + if v["is_category"] and not v["is_listing_only"]: + # catégorie non choisie + if v is not chosen: + category_not_chosen += 1 + # catégorie choisie mais qui est une category_* → renommée + elif chosen["path"].stem.lower().startswith("category"): + category_renamed += 1 + + if v is not chosen: + filename_key = normalize_title(Path(v["path"]).stem) + add_equivalence(filename_key, canonical_slug) + +print(f"{len(canonical_pages)} pages canoniques") +print(f"{category_not_chosen} pages homonymes 'category_*' non retenues") +print(f"{category_renamed} pages prefix 'category_*' renommées") +print(f"{len(potential_tags)} potential_tags enregistrés") # -------------------------------------------------- # PASS 3 — resolve redirects @@ -320,6 +354,16 @@ redirects.clear() # PASS 4 — normalisation finale des equivalences # -------------------------------------------------- +def resolve_equivalence(key): + seen = set() + while key in equivalences and key not in seen: + seen.add(key) + key = equivalences[key] + return key + +for k in list(equivalences): + equivalences[k] = resolve_equivalence(equivalences[k]) + valid_titles = { data["title"] for data in canonical_pages.values() @@ -338,10 +382,14 @@ for k, v in list(equivalences.items()): for k, v in equivalences.items(): if v not in valid_titles: problems.append(f"Non canonical mapping: {k} -> {v}") + equivalences = { k: v for k, v in equivalences.items() if k != v } + +for k in list(equivalences): + equivalences[k] = resolve_equivalence(equivalences[k]) # -------------------------------------------------- # PASS 5 — copie des pages canoniques # -------------------------------------------------- @@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str: copied = 0 total = len(canonical_pages) -for i, (key, data) in enumerate(canonical_pages.items(), 1): +for i, (article_id, data) in enumerate(canonical_pages.items(), 1): src = data["path"] - - dst_name = sanitize_filename(src.name.casefold()) + dst_name = title_to_filename(data["title"]) dst = PAGES_DIR / dst_name try: shutil.copy2(src, dst) - canonical_pages[key] = dst_name + canonical_pages[article_id] = dst_name copied += 1 except Exception as e: problems.append(f"Copy failed {src}: {e}") @@ -383,6 +430,7 @@ registry = { "canonical_pages": canonical_pages, "equivalences": equivalences, "redirects": redirects, + "potential_tags": potential_tags, "ignored_pages": ignored_pages, } @@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write(f"Redirects: {len(redirects)}\n") f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Problems: {len(problems)}\n\n") - for p in problems[:200]: + for p in problems: f.write(p + "\n") print("\n✅ PREPARATION COMPLETE") \ No newline at end of file