capture potential_tags

This commit is contained in:
Maxime Réaux 2026-04-09 12:05:15 +02:00
parent 7f019ed98c
commit 556d6f1e03

View file

@ -6,6 +6,7 @@ import html
from pathlib import Path from pathlib import Path
from collections import defaultdict from collections import defaultdict
from difflib import SequenceMatcher from difflib import SequenceMatcher
from bs4 import BeautifulSoup
SOURCE_DIR = Path("../original_index") SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output") OUTPUT_DIR = Path("../output")
@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
return None return None
def extract_page_identity(html: str): def extract_page_identity(page_html: str):
page = extract_wg_page_name(html) page = extract_wg_page_name(page_html)
if page: if page:
return page return page
m = re.search(r"<title>(.*?) -", html, re.I) m = re.search(r"<title>(.*?) -", page_html, re.I)
if m: if m:
return html.unescape(m.group(1)) return html.unescape(m.group(1))
return None return None
def extract_article_id(html: str) -> int | None: def extract_article_id(page_html: str) -> int | None:
m = ARTICLE_ID_RE.search(html) m = ARTICLE_ID_RE.search(page_html)
if m: if m:
aid = int(m.group(1)) aid = int(m.group(1))
if aid > 0: if aid > 0:
@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
return None return None
def extract_namespace(html: str) -> str: def extract_namespace(page_html: str) -> str:
m = NAMESPACE_RE.search(html) m = NAMESPACE_RE.search(page_html)
if m: if m:
return m.group(1) return m.group(1)
return "" return ""
@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
return key.strip() return key.strip()
def has_editorial_content(html_page: str) -> bool:
    """Return True when the page carries real editorial text.

    A page qualifies when its ``#mw-content-text`` container either is not
    an auto-generated category page at all, or holds more than 200
    characters of text before the auto-generated listing.

    Args:
        html_page: Raw HTML of one wiki page.

    Returns:
        False when the content container is missing; True for pages with
        no auto-generated category listing; otherwise True only when the
        text preceding the listing exceeds 200 characters.
    """
    soup = BeautifulSoup(html_page, "html.parser")
    content = soup.find(id="mw-content-text")
    if not content:
        return False
    auto = content.select_one(".mw-category-generated")
    if auto is None:
        return True  # not an auto-generated category page
    # Collect the text appearing BEFORE the generated listing.  Stop at the
    # listing even when it sits inside a wrapper element: select_one matches
    # at any depth, so a direct-child class check would miss a nested marker
    # and wrongly count the listing's own text as editorial content.
    parts = []
    for child in content.children:
        if child is auto or (hasattr(child, "descendants") and auto in child.descendants):
            break
        parts.append(child.get_text(" ", strip=True))
    editorial_text = "".join(parts).strip()
    return len(editorial_text) > 200
# -------------------------------------------------- # --------------------------------------------------
# Registry structures # Registry structures
# -------------------------------------------------- # --------------------------------------------------
@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
base_title = norm base_title = norm
is_redirect = bool(IS_REDIRECT_RE.search(page_html)) is_redirect = bool(IS_REDIRECT_RE.search(page_html))
is_category = ns == "Category" or norm.startswith("category:") is_category = ns == "Category" or norm.startswith("category:")
has_content = has_editorial_content(page_html)
is_listing_only = is_category and not has_content
wg_title = extract_wg_title(page_html) wg_title = extract_wg_title(page_html)
# Categories # Categories
@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
all_variants[article_id].append({ all_variants[article_id].append({
"path": path, "path": path,
"title": base_title, "title": base_title,
"canonical_key": full_title, "canonical_key": canonical_key,
"article_id": article_id, "article_id": article_id,
"wg_title": normalize_title(wg_title) if wg_title else None, "wg_title": normalize_title(wg_title) if wg_title else None,
"redirect": is_redirect, "redirect": is_redirect,
"is_category": is_category, "is_category": is_category,
"is_listing_only": is_listing_only,
}) })
except Exception as e: except Exception as e:
@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
# -------------------------------------------------- # --------------------------------------------------
canonical_pages = {} canonical_pages = {}
potential_tags = defaultdict(list)
equivalences = {} equivalences = {}
category_replaced = 0 category_renamed = 0
nb_all_cat = 0 category_not_chosen = 0
def slug_to_title(filename: str) -> str: def slug_to_title(filename: str) -> str:
@ -241,7 +272,7 @@ def variant_score(v):
) )
return ( return (
v["is_category"], v["is_listing_only"],
v["redirect"], v["redirect"],
not is_short_slug, not is_short_slug,
long_title_penalty, long_title_penalty,
@ -250,35 +281,6 @@ def variant_score(v):
filename.lower(), filename.lower(),
) )
# Pick one canonical variant per article id (deterministic choice).
for article_id, variants in all_variants.items():
    # variant_score orders variants deterministically, ending on the
    # lowercased filename as the final tie-breaker.
    variants_sorted = sorted(variants, key=variant_score)
    chosen = variants_sorted[0]

    # Bookkeeping: count ids whose variants are all categories, and how
    # often a category page ends up chosen as the canonical one.
    if all(v["is_category"] for v in variants):
        nb_all_cat += 1
    if chosen["is_category"]:
        category_replaced += 1

    canonical_title = normalize_reference_key(chosen["title"])
    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_title,
        "redirect": chosen["redirect"],
    }
    # Map every variant's key onto the chosen title.
    # NOTE: removed the per-iteration debug print of the full sorted
    # variant list — it flooded stdout on large inputs.
    for v in variants:
        equivalences[v["canonical_key"]] = chosen["title"]

equivalences.clear()
def add_equivalence(k, v): def add_equivalence(k, v):
k = normalize_reference_key(k) k = normalize_reference_key(k)
v = normalize_reference_key(v) v = normalize_reference_key(v)
@ -286,16 +288,48 @@ def add_equivalence(k, v):
if k != v: if k != v:
equivalences[k] = v equivalences[k] = v
for article_id, variants in all_variants.items():
canonical_title = canonical_pages[article_id]["title"]
canonical_slug = Path(canonical_pages[article_id]["path"]).stem
for v in variants:
add_equivalence(v["canonical_key"], canonical_slug)
filename_key = normalize_title(Path(v["path"]).stem)
add_equivalence(filename_key, canonical_slug)
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}") for article_id, variants in all_variants.items():
print(f"{category_replaced} 'category_*' remplacées par leur version de base") variants_sorted = sorted(variants, key=variant_score)
chosen = variants_sorted[0]
canonical_slug = normalize_reference_key(chosen["path"].stem)
# categories listing-only
if chosen["is_listing_only"]:
tag_name = normalize_reference_key(chosen["title"])
for v in variants:
potential_tags[tag_name].append(normalize_title(v["path"].stem))
if v["wg_title"]:
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
continue
canonical_pages[article_id] = {
"path": chosen["path"],
"title": canonical_slug,
"redirect": chosen["redirect"],
}
if chosen["wg_title"]:
add_equivalence(chosen["wg_title"], canonical_slug)
for v in variants:
if v["is_category"] and not v["is_listing_only"]:
# catégorie non choisie
if v is not chosen:
category_not_chosen += 1
# catégorie choisie mais qui est une category_* → renommée
elif chosen["path"].stem.lower().startswith("category"):
category_renamed += 1
if v is not chosen:
filename_key = normalize_title(Path(v["path"]).stem)
add_equivalence(filename_key, canonical_slug)
print(f"{len(canonical_pages)} pages canoniques")
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
print(f"{category_renamed} pages prefix 'category_*' renommées")
print(f"{len(potential_tags)} potential_tags enregistrés")
# -------------------------------------------------- # --------------------------------------------------
# PASS 3 — resolve redirects # PASS 3 — resolve redirects
@ -320,6 +354,16 @@ redirects.clear()
# PASS 4 — normalisation finale des equivalences # PASS 4 — normalisation finale des equivalences
# -------------------------------------------------- # --------------------------------------------------
def resolve_equivalence(key):
    """Follow chained equivalence mappings to their final target.

    Walks ``key -> equivalences[key] -> ...`` until the current key has no
    further mapping, keeping a visited set so a cycle cannot loop forever.
    Returns the last key reached.
    """
    visited = set()
    current = key
    while current in equivalences:
        if current in visited:
            break  # cycle detected: stop at the first repeated key
        visited.add(current)
        current = equivalences[current]
    return current
# Flatten chains so every mapping points directly at its final target
# (iterate over a snapshot of the keys since values are rewritten in place).
for k in list(equivalences):
    equivalences[k] = resolve_equivalence(equivalences[k])
valid_titles = { valid_titles = {
data["title"] data["title"]
for data in canonical_pages.values() for data in canonical_pages.values()
@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
for k, v in equivalences.items(): for k, v in equivalences.items():
if v not in valid_titles: if v not in valid_titles:
problems.append(f"Non canonical mapping: {k} -> {v}") problems.append(f"Non canonical mapping: {k} -> {v}")
equivalences = { equivalences = {
k: v for k, v in equivalences.items() k: v for k, v in equivalences.items()
if k != v if k != v
} }
# Re-resolve the surviving mappings so every value is a final target again
# after the self-mapping (k == v) entries were filtered out above.
for k in list(equivalences):
    equivalences[k] = resolve_equivalence(equivalences[k])
# -------------------------------------------------- # --------------------------------------------------
# PASS 5 — copie des pages canoniques # PASS 5 — copie des pages canoniques
# -------------------------------------------------- # --------------------------------------------------
@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
copied = 0 copied = 0
total = len(canonical_pages) total = len(canonical_pages)
for i, (key, data) in enumerate(canonical_pages.items(), 1): for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
src = data["path"] src = data["path"]
dst_name = title_to_filename(data["title"])
dst_name = sanitize_filename(src.name.casefold())
dst = PAGES_DIR / dst_name dst = PAGES_DIR / dst_name
try: try:
shutil.copy2(src, dst) shutil.copy2(src, dst)
canonical_pages[key] = dst_name canonical_pages[article_id] = dst_name
copied += 1 copied += 1
except Exception as e: except Exception as e:
problems.append(f"Copy failed {src}: {e}") problems.append(f"Copy failed {src}: {e}")
@ -383,6 +430,7 @@ registry = {
"canonical_pages": canonical_pages, "canonical_pages": canonical_pages,
"equivalences": equivalences, "equivalences": equivalences,
"redirects": redirects, "redirects": redirects,
"potential_tags": potential_tags,
"ignored_pages": ignored_pages, "ignored_pages": ignored_pages,
} }
@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write(f"Redirects: {len(redirects)}\n") f.write(f"Redirects: {len(redirects)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n") f.write(f"Problems: {len(problems)}\n\n")
for p in problems[:200]: for p in problems:
f.write(p + "\n") f.write(p + "\n")
print("\n✅ PREPARATION COMPLETE") print("\n✅ PREPARATION COMPLETE")