diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index 4b0ac16..b5784fb 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -48,12 +48,9 @@ def extract_page_identity(html: str):
page = extract_wg_page_name(html)
if page:
return page
-
- # fallback title tag
    m = re.search(r"<title>(.*?) -", html, re.I)
if m:
return html.unescape(m.group(1))
-
return None
@@ -66,18 +63,6 @@ def extract_article_id(html: str) -> int | None:
return None
-def extract_redirect(html: str) -> str | None:
- m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I)
- if m:
- return m.group(1).strip()
- return None
-
-
-def namespace_of(title: str):
- if ":" in title:
- return title.split(":", 1)[0]
-
-
def extract_internal_redirect(page_html: str):
m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
if m:
@@ -91,26 +76,24 @@ def extract_namespace(html: str) -> str:
return m.group(1)
return ""
+
# --------------------------------------------------
# Registry structures
# --------------------------------------------------
-canonical_pages = {}
-equivalences = {}
-redirects = {}
ignored_pages = []
problems = []
+redirects = {}
+all_variants = defaultdict(list)
files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")
-
# --------------------------------------------------
-# PASS 1 — analyse
+# PASS 1 — analyse et collecte des variantes
# --------------------------------------------------
for i, path in enumerate(files, 1):
-
try:
page_html = path.read_text(encoding="utf-8", errors="ignore")
@@ -126,50 +109,51 @@ for i, path in enumerate(files, 1):
ns = extract_namespace(page_html)
- # -------------------------
- # Ignore namespaces
- # -------------------------
+ # Ignorer certains namespaces
if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
ignored_pages.append(path.name)
continue
title = html.unescape(title)
norm = normalize_title(title)
+ base_title = norm
+ is_redirect = bool(IS_REDIRECT_RE.search(page_html))
+ is_category = ns == "Category" or norm.startswith("category:")
-
- # -------------------------
- # Category pages
- # -------------------------
- # Category pages CAN be canonical content
- if ns == "Category":
- norm = normalize_title(title)
- equivalences[norm] = norm
-
- # -------------------------
- # Redirect detection
- # -------------------------
+ # redirect interne
redir = extract_internal_redirect(page_html)
if redir:
redirects[normalize_title(redir)] = norm
- # -------------------------
- # Canonical article
- # -------------------------
- is_redirect = bool(IS_REDIRECT_RE.search(page_html))
- if article_id not in canonical_pages:
- canonical_pages[article_id] = {
- "path": path,
- "title": norm,
- "redirect": is_redirect,
- }
- elif canonical_pages[article_id]["redirect"] and not is_redirect:
- canonical_pages[article_id] = {
- "path": path,
- "title": norm,
- "redirect": is_redirect,
- }
- # self equivalence
- equivalences[norm] = norm
+ # Categories
+ if ns == "Category":
+ m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
+
+ if m_title:
+ wg_title = html.unescape(m_title.group(1))
+ cat_base = normalize_title(wg_title)
+
+ page_name = extract_wg_page_name(page_html)
+ page_norm = normalize_title(page_name) if page_name else None
+
+ if page_norm and page_norm != f"category:{cat_base}":
+ # page réelle déguisée en category
+ base_title = page_norm
+ is_category = False
+ else:
+ base_title = cat_base
+ is_category = True
+ else:
+ base_title = norm.replace("category:", "", 1)
+ is_category = True
+
+ all_variants[article_id].append({
+ "path": path,
+ "title": base_title,
+ "article_id": article_id,
+ "redirect": is_redirect,
+ "is_category": is_category,
+ })
except Exception as e:
problems.append(f"{path}: {e}")
@@ -177,9 +161,57 @@ for i, path in enumerate(files, 1):
if i % 200 == 0:
print(f"{i}/{len(files)} analysés")
+# --------------------------------------------------
+# PASS 2 — choix des versions canoniques
+# --------------------------------------------------
+
+canonical_pages = {}
+equivalences = {}
+category_replaced = 0
+nb_all_cat = 0
+
+
+def variant_score(v):
+ """
+ Plus le score est petit → meilleur candidat.
+ """
+ return (
+ v["is_category"], # False (0) meilleur que True (1)
+ v["redirect"], # False meilleur
+ "category:" in v["path"].name.lower(), # sécurité filename
+ len(v["path"].name), # stabilité
+ )
+
+
+for article_id, variants in all_variants.items():
+
+ # tri déterministe
+ variants_sorted = sorted(variants, key=variant_score)
+
+ chosen = variants_sorted[0]
+
+ if all(v["is_category"] for v in variants):
+ nb_all_cat += 1
+
+ if chosen["is_category"]:
+ category_replaced += 1
+
+ canonical_pages[article_id] = {
+ "path": chosen["path"],
+ "title": chosen["title"],
+ "redirect": chosen["redirect"],
+ }
+
+ # équivalences
+ for v in variants:
+ equivalences[v["title"]] = chosen["title"]
+
+
+print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
+print(f"{category_replaced} 'category_*' remplacées par leur version de base")
# --------------------------------------------------
-# PASS 2 — resolve redirects
+# PASS 3 — resolve redirects
# --------------------------------------------------
def resolve_redirect(key):
@@ -189,24 +221,18 @@ def resolve_redirect(key):
key = redirects[key]
return key
-
for k, v in list(redirects.items()):
equivalences[k] = resolve_redirect(v)
-
# --------------------------------------------------
-# PASS 3 — copy canonical pages
+# PASS 4 — copie des pages canoniques
# --------------------------------------------------
copied = 0
-
-# for key, src in canonical_pages.items():
for key, data in canonical_pages.items():
-
src = data["path"]
dst_name = sanitize_filename(src.name)
dst = PAGES_DIR / dst_name
-
try:
shutil.copy2(src, dst)
canonical_pages[key] = dst_name
@@ -216,7 +242,6 @@ for key, data in canonical_pages.items():
print(f"{copied} pages copiées")
-
# --------------------------------------------------
# SAVE REGISTRY
# --------------------------------------------------
@@ -229,11 +254,9 @@ registry = {
}
REGISTRY_PATH.parent.mkdir(exist_ok=True)
-
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
json.dump(registry, f, indent=2, ensure_ascii=False)
-
# --------------------------------------------------
# REPORT
# --------------------------------------------------
@@ -245,7 +268,6 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write(f"Redirects: {len(redirects)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n")
-
for p in problems[:200]:
f.write(p + "\n")