From 556d6f1e035ee5b927c4a9c0c295db9a5c060a55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?= <maxime.reaux@advans-group.com>
Date: Thu, 9 Apr 2026 12:05:15 +0200
Subject: [PATCH] capture potential_tags

---
 prepare_pages_and_registry.py | 156 ++++++++++++++++++++++------------
 1 file changed, 102 insertions(+), 54 deletions(-)
diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index 1b3ea96..ee391ba 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -6,6 +6,7 @@ import html
 from pathlib import Path
 from collections import defaultdict
 from difflib import SequenceMatcher
+from bs4 import BeautifulSoup
 
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
     return None
 
 
-def extract_page_identity(html: str):
-    page = extract_wg_page_name(html)
+def extract_page_identity(page_html: str):
+    page = extract_wg_page_name(page_html)
     if page:
         return page
-    m = re.search(r"<title>(.*?) -", html, re.I)
+    m = re.search(r"<title>(.*?) -", page_html, re.I)
     if m:
         return html.unescape(m.group(1))
     return None
 
 
-def extract_article_id(html: str) -> int | None:
-    m = ARTICLE_ID_RE.search(html)
+def extract_article_id(page_html: str) -> int | None:
+    m = ARTICLE_ID_RE.search(page_html)
     if m:
         aid = int(m.group(1))
         if aid > 0:
@@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
     return None
 
 
-def extract_namespace(html: str) -> str:
-    m = NAMESPACE_RE.search(html)
+def extract_namespace(page_html: str) -> str:
+    m = NAMESPACE_RE.search(page_html)
     if m:
         return m.group(1)
     return ""
@@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
     return key.strip()
 
 
+def has_editorial_content(html_page: str, *, min_chars: int = 200) -> bool:
+    """Tell whether a page holds editorial text besides a generated listing.
+
+    Returns False when the page has no ``#mw-content-text`` element and True
+    when that element contains no ``.mw-category-generated`` element.
+    Otherwise the text of the direct children that precede the first child
+    carrying that class is collected, and the result is True only when it is
+    longer than ``min_chars`` characters.
+    """
+    soup = BeautifulSoup(html_page, "html.parser")
+    content = soup.find(id="mw-content-text")
+    if content is None:
+        return False
+    if content.select_one(".mw-category-generated") is None:
+        return True  # no generated listing on this page
+
+    parts = []
+    for child in content.children:
+        # Children without a ``get`` method are never treated as the listing.
+        if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []):
+            break
+        parts.append(child.get_text(" ", strip=True))
+    # Pieces are joined without a separator before the length is measured.
+    return len("".join(parts).strip()) > min_chars
+
+
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
         base_title = norm
         is_redirect = bool(IS_REDIRECT_RE.search(page_html))
         is_category = ns == "Category" or norm.startswith("category:")
+        # Parse the HTML only for category pages; other pages never need it.
+        is_listing_only = is_category and not has_editorial_content(page_html)
         wg_title = extract_wg_title(page_html)
 
         # Categories
@@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
         all_variants[article_id].append({
             "path": path,
             "title": base_title,
-            "canonical_key": full_title,
+            "canonical_key": canonical_key,
             "article_id": article_id,
             "wg_title": normalize_title(wg_title) if wg_title else None,
             "redirect": is_redirect,
             "is_category": is_category,
+            "is_listing_only": is_listing_only,
         })
 
     except Exception as e:
@@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 
 canonical_pages = {}
+potential_tags = defaultdict(list)
 equivalences = {}
-category_replaced = 0
-nb_all_cat = 0
+category_renamed = 0
+category_not_chosen = 0
 
 
 def slug_to_title(filename: str) -> str:
@@ -241,7 +272,7 @@ def variant_score(v):
     )
 
     return (
-        v["is_category"],
+        v["is_listing_only"],
         v["redirect"],
         not is_short_slug,
         long_title_penalty,
@@ -250,35 +281,6 @@ def variant_score(v):
         filename.lower(),
     )
 
-
-for article_id, variants in all_variants.items():
-
-    # tri déterministe
-    variants_sorted = sorted(variants, key=variant_score)
-    print(f"variants_sorted: {variants_sorted}")
-
-    chosen = variants_sorted[0]
-
-    if all(v["is_category"] for v in variants):
-        nb_all_cat += 1
-
-    if chosen["is_category"]:
-        category_replaced += 1
-
-    canonical_title = normalize_reference_key(chosen["title"])
-    
-    canonical_pages[article_id] = {
-        "path": chosen["path"],
-        "title": canonical_title,
-        "redirect": chosen["redirect"],
-    }
-
-    # équivalences
-    for v in variants:
-        equivalences[v["canonical_key"]] = chosen["title"]
-
-equivalences.clear()
-
 def add_equivalence(k, v):
     k = normalize_reference_key(k)
     v = normalize_reference_key(v)
@@ -286,16 +288,48 @@ def add_equivalence(k, v):
     if k != v:
         equivalences[k] = v
 
-for article_id, variants in all_variants.items():
-    canonical_title = canonical_pages[article_id]["title"]
-    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
-    for v in variants:
-        add_equivalence(v["canonical_key"], canonical_slug)
-        filename_key = normalize_title(Path(v["path"]).stem)
-        add_equivalence(filename_key, canonical_slug)
 
-print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
-print(f"{category_replaced} 'category_*' remplacées par leur version de base")
+for article_id, variants in all_variants.items():
+    variants_sorted = sorted(variants, key=variant_score)
+    chosen = variants_sorted[0]  # lowest variant_score wins
+    # The canonical name comes from the chosen file's stem, not from its title.
+    canonical_slug = normalize_reference_key(chosen["path"].stem)
+    # variant_score ranks is_listing_only first, so a listing-only winner
+    # means every variant is listing-only: record a potential tag, not a page.
+    if chosen["is_listing_only"]:
+        tag_name = normalize_reference_key(chosen["title"])
+        for v in variants:  # NOTE(review): appended names are not de-duplicated
+            potential_tags[tag_name].append(normalize_title(v["path"].stem))
+            if v["wg_title"]:
+                potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
+        continue  # no canonical page and no equivalences for this article id
+    # Regular case: register the chosen variant as the canonical page.
+    canonical_pages[article_id] = {
+        "path": chosen["path"],
+        "title": canonical_slug,
+        "redirect": chosen["redirect"],
+    }
+    # The chosen variant's wg_title, when present, also maps to the slug.
+    if chosen["wg_title"]:
+        add_equivalence(chosen["wg_title"], canonical_slug)
+    # Count category variants that have editorial content, then alias the losers.
+    for v in variants:
+        if v["is_category"] and not v["is_listing_only"]:
+            # category variant that was not chosen
+            if v is not chosen:
+                category_not_chosen += 1
+            # chosen category that is itself a category_* file -> renamed
+            elif chosen["path"].stem.lower().startswith("category"):
+                category_renamed += 1
+
+        if v is not chosen:
+            filename_key = normalize_title(Path(v["path"]).stem)
+            add_equivalence(filename_key, canonical_slug)
+
+print(f"{len(canonical_pages)} pages canoniques")
+print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
+print(f"{category_renamed} pages prefix 'category_*' renommées")
+print(f"{len(potential_tags)} potential_tags enregistrés")
 
 # --------------------------------------------------
 # PASS 3 — resolve redirects
@@ -320,6 +354,16 @@ redirects.clear()
 # PASS 4 — normalisation finale des equivalences
 # --------------------------------------------------
 
+def resolve_equivalence(key):  # follow the equivalences mapping to the end of its chain
+    seen = set()  # keys already followed, so a cyclic mapping cannot loop forever
+    while key in equivalences and key not in seen:
+        seen.add(key)
+        key = equivalences[key]
+    return key  # first key that is unmapped or was already visited
+
+for k in list(equivalences):  # flatten chains: each key points at its resolved target
+    equivalences[k] = resolve_equivalence(equivalences[k])
+
 valid_titles = {
     data["title"]
     for data in canonical_pages.values()
@@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")
+
 equivalences = {
     k: v for k, v in equivalences.items()
     if k != v
 }
+
+for k in list(equivalences):  # resolve again now that the k == v entries are gone
+    equivalences[k] = resolve_equivalence(equivalences[k])
 # --------------------------------------------------
 # PASS 5 — copie des pages canoniques
 # --------------------------------------------------
@@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
 copied = 0
 total = len(canonical_pages)
 
-for i, (key, data) in enumerate(canonical_pages.items(), 1):
+for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
 
     src = data["path"]
-
-    dst_name = sanitize_filename(src.name.casefold())
+    dst_name = title_to_filename(data["title"])
     dst = PAGES_DIR / dst_name
 
     try:
         shutil.copy2(src, dst)
-        canonical_pages[key] = dst_name
+        canonical_pages[article_id] = dst_name
         copied += 1
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
@@ -383,6 +430,7 @@ registry = {
     "canonical_pages": canonical_pages,
     "equivalences": equivalences,
     "redirects": redirects,
+    "potential_tags": potential_tags,
     "ignored_pages": ignored_pages,
 }
 
@@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
     f.write(f"Redirects: {len(redirects)}\n")
     f.write(f"Ignored: {len(ignored_pages)}\n")
     f.write(f"Problems: {len(problems)}\n\n")
-    for p in problems[:200]:
+    for p in problems:
         f.write(p + "\n")
 
 print("\n✅ PREPARATION COMPLETE")
\ No newline at end of file