diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index 1b3ea96..ee391ba 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -6,6 +6,7 @@ import html
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
+from bs4 import BeautifulSoup
SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output")
@@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
return None
-def extract_page_identity(html: str):
- page = extract_wg_page_name(html)
+def extract_page_identity(page_html: str):
+ page = extract_wg_page_name(page_html)
if page:
return page
-    m = re.search(r"<title>(.*?) -", html, re.I)
+    m = re.search(r"<title>(.*?) -", page_html, re.I)
if m:
return html.unescape(m.group(1))
return None
-def extract_article_id(html: str) -> int | None:
- m = ARTICLE_ID_RE.search(html)
+def extract_article_id(page_html: str) -> int | None:
+ m = ARTICLE_ID_RE.search(page_html)
if m:
aid = int(m.group(1))
if aid > 0:
@@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
return None
-def extract_namespace(html: str) -> str:
- m = NAMESPACE_RE.search(html)
+def extract_namespace(page_html: str) -> str:
+ m = NAMESPACE_RE.search(page_html)
if m:
return m.group(1)
return ""
@@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
return key.strip()
+
+
+def has_editorial_content(html_page: str) -> bool:
+    soup = BeautifulSoup(html_page, "html.parser")
+
+    content = soup.find(id="mw-content-text")
+    if not content:
+        return False
+
+    auto = content.select_one(".mw-category-generated")
+    if not auto:
+        return True  # not an auto-generated category page
+
+    # text BEFORE the listing
+    editorial_text = ""
+
+    for child in content.children:
+        if getattr(child, "get", None) and "mw-category-generated" in child.get("class", []):
+            break
+        editorial_text += child.get_text(" ", strip=True)
+
+    editorial_text = editorial_text.strip()
+
+    return len(editorial_text) > 200
+
+
# --------------------------------------------------
# Registry structures
# --------------------------------------------------
@@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
base_title = norm
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
is_category = ns == "Category" or norm.startswith("category:")
+ has_content = has_editorial_content(page_html)
+ is_listing_only = is_category and not has_content
wg_title = extract_wg_title(page_html)
# Categories
@@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
all_variants[article_id].append({
"path": path,
"title": base_title,
- "canonical_key": full_title,
+ "canonical_key": canonical_key,
"article_id": article_id,
"wg_title": normalize_title(wg_title) if wg_title else None,
"redirect": is_redirect,
"is_category": is_category,
+ "is_listing_only": is_listing_only,
})
except Exception as e:
@@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
# --------------------------------------------------
canonical_pages = {}
+potential_tags = defaultdict(list)
equivalences = {}
-category_replaced = 0
-nb_all_cat = 0
+category_renamed = 0
+category_not_chosen = 0
def slug_to_title(filename: str) -> str:
@@ -241,7 +272,7 @@ def variant_score(v):
)
return (
- v["is_category"],
+ v["is_listing_only"],
v["redirect"],
not is_short_slug,
long_title_penalty,
@@ -250,35 +281,6 @@ def variant_score(v):
filename.lower(),
)
-
-for article_id, variants in all_variants.items():
-
- # tri déterministe
- variants_sorted = sorted(variants, key=variant_score)
- print(f"variants_sorted: {variants_sorted}")
-
- chosen = variants_sorted[0]
-
- if all(v["is_category"] for v in variants):
- nb_all_cat += 1
-
- if chosen["is_category"]:
- category_replaced += 1
-
- canonical_title = normalize_reference_key(chosen["title"])
-
- canonical_pages[article_id] = {
- "path": chosen["path"],
- "title": canonical_title,
- "redirect": chosen["redirect"],
- }
-
- # équivalences
- for v in variants:
- equivalences[v["canonical_key"]] = chosen["title"]
-
-equivalences.clear()
-
def add_equivalence(k, v):
k = normalize_reference_key(k)
v = normalize_reference_key(v)
@@ -286,16 +288,48 @@ def add_equivalence(k, v):
if k != v:
equivalences[k] = v
-for article_id, variants in all_variants.items():
- canonical_title = canonical_pages[article_id]["title"]
- canonical_slug = Path(canonical_pages[article_id]["path"]).stem
- for v in variants:
- add_equivalence(v["canonical_key"], canonical_slug)
- filename_key = normalize_title(Path(v["path"]).stem)
- add_equivalence(filename_key, canonical_slug)
-print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
-print(f"{category_replaced} 'category_*' remplacées par leur version de base")
+for article_id, variants in all_variants.items():
+    variants_sorted = sorted(variants, key=variant_score)
+    chosen = variants_sorted[0]
+
+    canonical_slug = normalize_reference_key(chosen["path"].stem)
+
+    # listing-only categories
+    if chosen["is_listing_only"]:
+        tag_name = normalize_reference_key(chosen["title"])
+        for v in variants:
+            potential_tags[tag_name].append(normalize_title(v["path"].stem))
+            if v["wg_title"]:
+                potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
+        continue
+
+    canonical_pages[article_id] = {
+        "path": chosen["path"],
+        "title": canonical_slug,
+        "redirect": chosen["redirect"],
+    }
+
+    if chosen["wg_title"]:
+        add_equivalence(chosen["wg_title"], canonical_slug)
+
+    for v in variants:
+        if v["is_category"] and not v["is_listing_only"]:
+            # category variant not selected as canonical
+            if v is not chosen:
+                category_not_chosen += 1
+            # chosen page stored under a category_* filename → renamed
+            elif chosen["path"].stem.lower().startswith("category"):
+                category_renamed += 1
+
+        if v is not chosen:
+            filename_key = normalize_title(Path(v["path"]).stem)
+            add_equivalence(filename_key, canonical_slug)
+
+print(f"{len(canonical_pages)} pages canoniques")
+print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
+print(f"{category_renamed} pages prefix 'category_*' renommées")
+print(f"{len(potential_tags)} potential_tags enregistrés")
# --------------------------------------------------
# PASS 3 — resolve redirects
@@ -320,6 +354,16 @@ redirects.clear()
# PASS 4 — normalisation finale des equivalences
# --------------------------------------------------
+def resolve_equivalence(key):
+    seen = set()
+    while key in equivalences and key not in seen:
+        seen.add(key)
+        key = equivalences[key]
+    return key
+
+for k in list(equivalences):
+    equivalences[k] = resolve_equivalence(equivalences[k])
+
valid_titles = {
data["title"]
for data in canonical_pages.values()
@@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
for k, v in equivalences.items():
if v not in valid_titles:
problems.append(f"Non canonical mapping: {k} -> {v}")
+
equivalences = {
k: v for k, v in equivalences.items()
if k != v
}
+
+for k in list(equivalences):
+ equivalences[k] = resolve_equivalence(equivalences[k])
# --------------------------------------------------
# PASS 5 — copie des pages canoniques
# --------------------------------------------------
@@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
copied = 0
total = len(canonical_pages)
-for i, (key, data) in enumerate(canonical_pages.items(), 1):
+for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
src = data["path"]
-
- dst_name = sanitize_filename(src.name.casefold())
+ dst_name = title_to_filename(data["title"])
dst = PAGES_DIR / dst_name
try:
shutil.copy2(src, dst)
- canonical_pages[key] = dst_name
+ canonical_pages[article_id] = dst_name
copied += 1
except Exception as e:
problems.append(f"Copy failed {src}: {e}")
@@ -383,6 +430,7 @@ registry = {
"canonical_pages": canonical_pages,
"equivalences": equivalences,
"redirects": redirects,
+ "potential_tags": potential_tags,
"ignored_pages": ignored_pages,
}
@@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write(f"Redirects: {len(redirects)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n")
- for p in problems[:200]:
+ for p in problems:
f.write(p + "\n")
print("\n✅ PREPARATION COMPLETE")
\ No newline at end of file