capture potential_tags
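
Listing-only categories (auto-generated category listings with no editorial text
before the listing) are no longer promoted to canonical pages; their slugs and
wgTitle variants are collected in potential_tags instead. The PASS 2 rewrite also
renames the html parameters that shadowed the stdlib html module and makes
equivalences point at the canonical slug rather than the title.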

Maxime Réaux 2026-04-09 12:05:15 +02:00
parent 7f019ed98c
commit 556d6f1e03


@@ -6,6 +6,7 @@ import html
 from pathlib import Path
 from collections import defaultdict
 from difflib import SequenceMatcher
+from bs4 import BeautifulSoup
 
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
     return None
 
-def extract_page_identity(html: str):
-    page = extract_wg_page_name(html)
+def extract_page_identity(page_html: str):
+    page = extract_wg_page_name(page_html)
     if page:
         return page
-    m = re.search(r"<title>(.*?) -", html, re.I)
+    m = re.search(r"<title>(.*?) -", page_html, re.I)
     if m:
         return html.unescape(m.group(1))
     return None
 
-def extract_article_id(html: str) -> int | None:
-    m = ARTICLE_ID_RE.search(html)
+def extract_article_id(page_html: str) -> int | None:
+    m = ARTICLE_ID_RE.search(page_html)
     if m:
         aid = int(m.group(1))
         if aid > 0:
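
The rename is more than cosmetic: in the old signatures the html parameter
shadowed the imported html module, so html.unescape inside
extract_page_identity resolved to the page string instead of the module. A
minimal illustration (function names invented for this sketch):

    import html

    def old_style(html):  # parameter shadows the stdlib module
        return html.unescape("&amp;")  # AttributeError: 'str' object has no attribute 'unescape'

    def new_style(page_html):
        return html.unescape("&amp;")  # resolves to the module, as intended

    print(new_style("<p>page</p>"))  # prints: &
    try:
        old_style("<p>page</p>")
    except AttributeError as e:
        print(e)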
@@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
     return None
 
-def extract_namespace(html: str) -> str:
-    m = NAMESPACE_RE.search(html)
+def extract_namespace(page_html: str) -> str:
+    m = NAMESPACE_RE.search(page_html)
     if m:
         return m.group(1)
     return ""
@@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
     return key.strip()
 
+def has_editorial_content(html_page: str) -> bool:
+    soup = BeautifulSoup(html_page, "html.parser")
+    content = soup.find(id="mw-content-text")
+    if not content:
+        return False
+
+    auto = content.select_one(".mw-category-generated")
+    if not auto:
+        return True  # not an auto-generated category
+
+    # text BEFORE the listing
+    editorial_text = ""
+    for child in content.children:
+        if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []):
+            break
+        editorial_text += child.get_text(" ", strip=True)
+
+    editorial_text = editorial_text.strip()
+    return len(editorial_text) > 200
+
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
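
A quick sanity check of the new heuristic, assuming has_editorial_content from
the hunk above is in scope (the HTML snippets are invented; the 200-character
threshold is the one hard-coded in the function):

    listing_only = '<div id="mw-content-text"><div class="mw-category-generated">A</div></div>'
    with_intro = ('<div id="mw-content-text"><p>'
                  + 'Editorial introduction. ' * 20
                  + '</p><div class="mw-category-generated">A</div></div>')

    print(has_editorial_content(listing_only))  # False: no text before the listing
    print(has_editorial_content(with_intro))    # True: ~480 chars before the listing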
@@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
         base_title = norm
         is_redirect = bool(IS_REDIRECT_RE.search(page_html))
         is_category = ns == "Category" or norm.startswith("category:")
+        has_content = has_editorial_content(page_html)
+        is_listing_only = is_category and not has_content
         wg_title = extract_wg_title(page_html)
 
         # Categories
@@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
         all_variants[article_id].append({
             "path": path,
             "title": base_title,
-            "canonical_key": full_title,
+            "canonical_key": canonical_key,
             "article_id": article_id,
             "wg_title": normalize_title(wg_title) if wg_title else None,
             "redirect": is_redirect,
             "is_category": is_category,
+            "is_listing_only": is_listing_only,
         })
 
     except Exception as e:
@@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 canonical_pages = {}
+potential_tags = defaultdict(list)
 equivalences = {}
 
-category_replaced = 0
-nb_all_cat = 0
+category_renamed = 0
+category_not_chosen = 0
 
 def slug_to_title(filename: str) -> str:
@@ -241,7 +272,7 @@ def variant_score(v):
     )
 
     return (
-        v["is_category"],
+        v["is_listing_only"],
         v["redirect"],
         not is_short_slug,
        long_title_penalty,
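
variant_score returns a tuple, and tuples compare element-wise with False < True,
so swapping is_category for is_listing_only lets a category page with real
editorial content win the canonical slot; only listing-only categories are
pushed to the back. A toy illustration of the ordering (fields invented):

    candidates = [
        (True,  False, "category_foo"),  # listing-only category
        (False, True,  "foo_redirect"),  # redirect
        (False, False, "foo"),           # plain page, or category with content
    ]
    print(sorted(candidates)[0][2])  # foo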
@@ -250,35 +281,6 @@ def variant_score(v):
         filename.lower(),
     )
 
-for article_id, variants in all_variants.items():
-    # deterministic sort
-    variants_sorted = sorted(variants, key=variant_score)
-    print(f"variants_sorted: {variants_sorted}")
-    chosen = variants_sorted[0]
-
-    if all(v["is_category"] for v in variants):
-        nb_all_cat += 1
-    if chosen["is_category"]:
-        category_replaced += 1
-
-    canonical_title = normalize_reference_key(chosen["title"])
-    canonical_pages[article_id] = {
-        "path": chosen["path"],
-        "title": canonical_title,
-        "redirect": chosen["redirect"],
-    }
-
-    # equivalences
-    for v in variants:
-        equivalences[v["canonical_key"]] = chosen["title"]
-
-equivalences.clear()
-
 def add_equivalence(k, v):
     k = normalize_reference_key(k)
     v = normalize_reference_key(v)
@@ -286,16 +288,48 @@ def add_equivalence(k, v):
     if k != v:
         equivalences[k] = v
 
-for article_id, variants in all_variants.items():
-    canonical_title = canonical_pages[article_id]["title"]
-    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
-    for v in variants:
-        add_equivalence(v["canonical_key"], canonical_slug)
-        filename_key = normalize_title(Path(v["path"]).stem)
-        add_equivalence(filename_key, canonical_slug)
-
-print(f"Number of cases where all variants are categories: {nb_all_cat}")
-print(f"{category_replaced} 'category_*' pages replaced by their base version")
+for article_id, variants in all_variants.items():
+    variants_sorted = sorted(variants, key=variant_score)
+    chosen = variants_sorted[0]
+    canonical_slug = normalize_reference_key(chosen["path"].stem)
+
+    # listing-only categories
+    if chosen["is_listing_only"]:
+        tag_name = normalize_reference_key(chosen["title"])
+        for v in variants:
+            potential_tags[tag_name].append(normalize_title(v["path"].stem))
+            if v["wg_title"]:
+                potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
+        continue
+
+    canonical_pages[article_id] = {
+        "path": chosen["path"],
+        "title": canonical_slug,
+        "redirect": chosen["redirect"],
+    }
+
+    if chosen["wg_title"]:
+        add_equivalence(chosen["wg_title"], canonical_slug)
+
+    for v in variants:
+        if v["is_category"] and not v["is_listing_only"]:
+            # category variant that was not chosen
+            if v is not chosen:
+                category_not_chosen += 1
+            # chosen category still named category_* → renamed
+            elif chosen["path"].stem.lower().startswith("category"):
+                category_renamed += 1
+        if v is not chosen:
+            filename_key = normalize_title(Path(v["path"]).stem)
+            add_equivalence(filename_key, canonical_slug)
+
+print(f"{len(canonical_pages)} canonical pages")
+print(f"{category_not_chosen} homonymous 'category_*' pages not selected")
+print(f"{category_renamed} 'category_*'-prefixed pages renamed")
+print(f"{len(potential_tags)} potential_tags recorded")
 
 # --------------------------------------------------
 # PASS 3 — resolve redirects
 # --------------------------------------------------
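
In miniature, the tag capture above builds a map from a tag name to every
spelling of it seen across variants. A hypothetical listing-only category might
end up as (data invented for illustration):

    from collections import defaultdict

    potential_tags_demo = defaultdict(list)
    for stem, wg in [("category_medieval_weapons", "Medieval Weapons"),
                     ("medieval_weapons", None)]:
        potential_tags_demo["medieval weapons"].append(stem)
        if wg:
            potential_tags_demo["medieval weapons"].append(wg)
    print(dict(potential_tags_demo))
    # {'medieval weapons': ['category_medieval_weapons', 'Medieval Weapons', 'medieval_weapons']}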
@@ -320,6 +354,16 @@ redirects.clear()
 # --------------------------------------------------
 # PASS 4 — final normalization of equivalences
 # --------------------------------------------------
 
+def resolve_equivalence(key):
+    seen = set()
+    while key in equivalences and key not in seen:
+        seen.add(key)
+        key = equivalences[key]
+    return key
+
+for k in list(equivalences):
+    equivalences[k] = resolve_equivalence(equivalences[k])
+
 valid_titles = {
     data["title"]
     for data in canonical_pages.values()
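
resolve_equivalence follows mapping chains until it reaches a key that maps
nowhere, with the seen set breaking accidental cycles. A standalone twin of the
function (hypothetical table) shows the collapse:

    def follow(key, table):
        seen = set()
        while key in table and key not in seen:
            seen.add(key)
            key = table[key]
        return key

    demo = {"old_slug": "renamed_slug", "renamed_slug": "final_slug", "a": "b", "b": "a"}
    print(follow("old_slug", demo))  # final_slug
    print(follow("a", demo))         # 'a': the a -> b -> a cycle stops once a key repeats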
@@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")
 
+equivalences = {
+    k: v for k, v in equivalences.items()
+    if k != v
+}
 
 # --------------------------------------------------
 # PASS 5 — copy canonical pages
 # --------------------------------------------------
@@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
 copied = 0
 total = len(canonical_pages)
 
-for i, (key, data) in enumerate(canonical_pages.items(), 1):
+for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
     src = data["path"]
-    dst_name = sanitize_filename(src.name.casefold())
+    dst_name = title_to_filename(data["title"])
     dst = PAGES_DIR / dst_name
 
     try:
         shutil.copy2(src, dst)
-        canonical_pages[key] = dst_name
+        canonical_pages[article_id] = dst_name
         copied += 1
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
@@ -383,6 +430,7 @@ registry = {
     "canonical_pages": canonical_pages,
     "equivalences": equivalences,
     "redirects": redirects,
+    "potential_tags": potential_tags,
     "ignored_pages": ignored_pages,
 }
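
Since potential_tags is a defaultdict(list), a dict subclass, it serializes
like a plain dict if the registry is dumped with json (the dump itself is
outside this hunk). A sketch:

    import json
    from collections import defaultdict

    tags = defaultdict(list)
    tags["demo"].append("slug_a")
    print(json.dumps({"potential_tags": tags}))  # {"potential_tags": {"demo": ["slug_a"]}}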
@@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
     f.write(f"Redirects: {len(redirects)}\n")
     f.write(f"Ignored: {len(ignored_pages)}\n")
     f.write(f"Problems: {len(problems)}\n\n")
-    for p in problems[:200]:
+    for p in problems:
         f.write(p + "\n")
 
 print("\n✅ PREPARATION COMPLETE")