capture potential_tags

2026-04-09 12:05:15 +02:00 · 2026-04-09 12:05:15 +02:00 · 556d6f1e03
commit 556d6f1e03
parent 7f019ed98c
1 changed files with 102 additions and 54 deletions
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@ -6,6 +6,7 @@ import html
 from pathlib import Path
 from collections import defaultdict
 from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
    return None
-def extract_page_identity(html: str):
+def extract_page_identity(page_html: str):
-    page = extract_wg_page_name(html)
+    page = extract_wg_page_name(page_html)
    if page:
        return page
-    m = re.search(r"<title>(.*?) -", html, re.I)
+    m = re.search(r"<title>(.*?) -", page_html, re.I)
    if m:
        return html.unescape(m.group(1))
    return None
-def extract_article_id(html: str) -> int | None:
+def extract_article_id(page_html: str) -> int | None:
-    m = ARTICLE_ID_RE.search(html)
+    m = ARTICLE_ID_RE.search(page_html)
    if m:
        aid = int(m.group(1))
        if aid > 0:
@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
    return None
-def extract_namespace(html: str) -> str:
+def extract_namespace(page_html: str) -> str:
-    m = NAMESPACE_RE.search(html)
+    m = NAMESPACE_RE.search(page_html)
    if m:
        return m.group(1)
    return ""
@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
    return key.strip()
 def has_editorial_content(html_page: str) -> bool:
    """Tell whether a page carries text of its own besides a generated listing.

    Returns False when the page has no ``mw-content-text`` element, True when
    that element contains no ``.mw-category-generated`` listing, and otherwise
    True only if the text collected ahead of the listing is longer than 200
    characters.
    """
    content = BeautifulSoup(html_page, "html.parser").find(id="mw-content-text")
    if not content:
        return False
    if not content.select_one(".mw-category-generated"):
        # No automatic listing at all: this is not an auto-generated category.
        return True
    # Gather the text of the direct children that come BEFORE the listing.
    # NOTE(review): only direct children are inspected here, whereas
    # select_one() above matches any descendant; if the listing is nested
    # deeper, the loop never breaks and the listing text is counted as well.
    chunks = []
    for node in content.children:
        is_tag = getattr(node, "get", None)
        if is_tag and "mw-category-generated" in node.get("class", []):
            break
        chunks.append(node.get_text(" ", strip=True))
    return len("".join(chunks).strip()) > 200
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
        base_title = norm
        is_redirect = bool(IS_REDIRECT_RE.search(page_html))
        is_category = ns == "Category" or norm.startswith("category:")
        has_content = has_editorial_content(page_html)
        is_listing_only = is_category and not has_content
        wg_title = extract_wg_title(page_html)
        # Categories
@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
        all_variants[article_id].append({
            "path": path,
            "title": base_title,
-            "canonical_key": full_title,
+            "canonical_key": canonical_key,
            "article_id": article_id,
            "wg_title": normalize_title(wg_title) if wg_title else None,
            "redirect": is_redirect,
            "is_category": is_category,
            "is_listing_only": is_listing_only,
        })
    except Exception as e:
@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 canonical_pages = {}
 potential_tags = defaultdict(list)
 equivalences = {}
-category_replaced = 0
+category_renamed = 0
-nb_all_cat = 0
+category_not_chosen = 0
 def slug_to_title(filename: str) -> str:
@ -241,7 +272,7 @@ def variant_score(v):
    )
    return (
-        v["is_category"],
+        v["is_listing_only"],
        v["redirect"],
        not is_short_slug,
        long_title_penalty,
@ -250,35 +281,6 @@ def variant_score(v):
        filename.lower(),
    )
 for article_id, variants in all_variants.items():
    # tri déterministe
    variants_sorted = sorted(variants, key=variant_score)
    print(f"variants_sorted: {variants_sorted}")
    chosen = variants_sorted[0]
    if all(v["is_category"] for v in variants):
        nb_all_cat += 1
    if chosen["is_category"]:
        category_replaced += 1
    canonical_title = normalize_reference_key(chosen["title"])
    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_title,
        "redirect": chosen["redirect"],
    }
    # équivalences
    for v in variants:
        equivalences[v["canonical_key"]] = chosen["title"]
 equivalences.clear()
 def add_equivalence(k, v):
    k = normalize_reference_key(k)
    v = normalize_reference_key(v)
@ -286,16 +288,48 @@ def add_equivalence(k, v):
    if k != v:
        equivalences[k] = v
 for article_id, variants in all_variants.items():
-    canonical_title = canonical_pages[article_id]["title"]
+    variants_sorted = sorted(variants, key=variant_score)
-    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
+    chosen = variants_sorted[0]
    canonical_slug = normalize_reference_key(chosen["path"].stem)
    # categories listing-only
    if chosen["is_listing_only"]:
        tag_name = normalize_reference_key(chosen["title"])
        for v in variants:
-        add_equivalence(v["canonical_key"], canonical_slug)
+            potential_tags[tag_name].append(normalize_title(v["path"].stem))
            if v["wg_title"]:
                potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
        continue
    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_slug,
        "redirect": chosen["redirect"],
    }
    if chosen["wg_title"]:
        add_equivalence(chosen["wg_title"], canonical_slug)
    for v in variants:
        if v["is_category"] and not v["is_listing_only"]:
            # catégorie non choisie
            if v is not chosen:
                category_not_chosen += 1
            # catégorie choisie mais qui est une category_* → renommée
            elif chosen["path"].stem.lower().startswith("category"):
                category_renamed += 1
        if v is not chosen:
            filename_key = normalize_title(Path(v["path"]).stem)
            add_equivalence(filename_key, canonical_slug)
-print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
+print(f"{len(canonical_pages)} pages canoniques")
-print(f"{category_replaced} 'category_*' remplacées par leur version de base")
+print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
 print(f"{category_renamed} pages prefix 'category_*' renommées")
 print(f"{len(potential_tags)} potential_tags enregistrés")
 # --------------------------------------------------
 # PASS 3 — resolve redirects
@ -320,6 +354,16 @@ redirects.clear()
 # PASS 4 — normalisation finale des equivalences
 # --------------------------------------------------
 def resolve_equivalence(key):
    """Follow ``equivalences`` from *key* until the chain ends.

    Returns the first key reached that is either absent from the
    module-level ``equivalences`` mapping or has already been visited; the
    visited set keeps a cyclic chain from looping forever.
    """
    visited = set()
    current = key
    while True:
        if current not in equivalences or current in visited:
            return current
        visited.add(current)
        current = equivalences[current]
 for k in list(equivalences):
    equivalences[k] = resolve_equivalence(equivalences[k])
 valid_titles = {
    data["title"]
    for data in canonical_pages.values()
@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
 for k, v in equivalences.items():
    if v not in valid_titles:
        problems.append(f"Non canonical mapping: {k} -> {v}")
 equivalences = {
    k: v for k, v in equivalences.items()
    if k != v
 }
 for k in list(equivalences):
    equivalences[k] = resolve_equivalence(equivalences[k])
 # --------------------------------------------------
 # PASS 5 — copie des pages canoniques
 # --------------------------------------------------
@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
 copied = 0
 total = len(canonical_pages)
-for i, (key, data) in enumerate(canonical_pages.items(), 1):
+for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
    src = data["path"]
-
+    dst_name = title_to_filename(data["title"])
    dst_name = sanitize_filename(src.name.casefold())
    dst = PAGES_DIR / dst_name
    try:
        shutil.copy2(src, dst)
-        canonical_pages[key] = dst_name
+        canonical_pages[article_id] = dst_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")
@ -383,6 +430,7 @@ registry = {
    "canonical_pages": canonical_pages,
    "equivalences": equivalences,
    "redirects": redirects,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
 }
@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write(f"Redirects: {len(redirects)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
-    for p in problems[:200]:
+    for p in problems:
        f.write(p + "\n")
 print("\n✅ PREPARATION COMPLETE")