fix page dedup

This commit is contained in:
maximator 2026-04-04 01:32:39 +02:00
parent c7b45432b1
commit a3f3e61e7d

View file

@ -48,12 +48,9 @@ def extract_page_identity(html: str):
page = extract_wg_page_name(html) page = extract_wg_page_name(html)
if page: if page:
return page return page
# fallback title tag
m = re.search(r"<title>(.*?) -", html, re.I) m = re.search(r"<title>(.*?) -", html, re.I)
if m: if m:
return html.unescape(m.group(1)) return html.unescape(m.group(1))
return None return None
@ -66,18 +63,6 @@ def extract_article_id(html: str) -> int | None:
return None return None
def extract_redirect(html: str) -> str | None:
m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I)
if m:
return m.group(1).strip()
return None
def namespace_of(title: str):
    """Return the namespace prefix of *title* (text before the first colon).

    Returns None when the title contains no colon, i.e. lives in the main
    namespace. The ``return None`` is made explicit: the original fell off
    the end of the function on that path, which reads as an accidental
    omission rather than a deliberate contract.
    """
    if ":" in title:
        # split with maxsplit=1 so titles like "Talk:A:B" keep only "Talk"
        return title.split(":", 1)[0]
    return None
def extract_internal_redirect(page_html: str): def extract_internal_redirect(page_html: str):
m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html) m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
if m: if m:
@ -91,26 +76,24 @@ def extract_namespace(html: str) -> str:
return m.group(1) return m.group(1)
return "" return ""
# -------------------------------------------------- # --------------------------------------------------
# Registry structures # Registry structures
# -------------------------------------------------- # --------------------------------------------------
canonical_pages = {}
equivalences = {}
redirects = {}
ignored_pages = [] ignored_pages = []
problems = [] problems = []
redirects = {}
all_variants = defaultdict(list)
files = list(SOURCE_DIR.glob("*.html")) files = list(SOURCE_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés") print(f"{len(files)} fichiers trouvés")
# -------------------------------------------------- # --------------------------------------------------
# PASS 1 — analyse # PASS 1 — analyse et collecte des variantes
# -------------------------------------------------- # --------------------------------------------------
for i, path in enumerate(files, 1): for i, path in enumerate(files, 1):
try: try:
page_html = path.read_text(encoding="utf-8", errors="ignore") page_html = path.read_text(encoding="utf-8", errors="ignore")
@ -126,50 +109,51 @@ for i, path in enumerate(files, 1):
ns = extract_namespace(page_html) ns = extract_namespace(page_html)
# ------------------------- # Ignorer certains namespaces
# Ignore namespaces
# -------------------------
if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"): if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
ignored_pages.append(path.name) ignored_pages.append(path.name)
continue continue
title = html.unescape(title) title = html.unescape(title)
norm = normalize_title(title) norm = normalize_title(title)
base_title = norm
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
is_category = ns == "Category" or norm.startswith("category:")
# redirect interne
# -------------------------
# Category pages
# -------------------------
# Category pages CAN be canonical content
if ns == "Category":
norm = normalize_title(title)
equivalences[norm] = norm
# -------------------------
# Redirect detection
# -------------------------
redir = extract_internal_redirect(page_html) redir = extract_internal_redirect(page_html)
if redir: if redir:
redirects[normalize_title(redir)] = norm redirects[normalize_title(redir)] = norm
# -------------------------
# Canonical article
# -------------------------
is_redirect = bool(IS_REDIRECT_RE.search(page_html)) # Categories
if article_id not in canonical_pages: if ns == "Category":
canonical_pages[article_id] = { m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
"path": path,
"title": norm, if m_title:
"redirect": is_redirect, wg_title = html.unescape(m_title.group(1))
} cat_base = normalize_title(wg_title)
elif canonical_pages[article_id]["redirect"] and not is_redirect:
canonical_pages[article_id] = { page_name = extract_wg_page_name(page_html)
"path": path, page_norm = normalize_title(page_name) if page_name else None
"title": norm,
"redirect": is_redirect, if page_norm and page_norm != f"category:{cat_base}":
} # page réelle déguisée en category
# self equivalence base_title = page_norm
equivalences[norm] = norm is_category = False
else:
base_title = cat_base
is_category = True
else:
base_title = norm.replace("category:", "", 1)
is_category = True
all_variants[article_id].append({
"path": path,
"title": base_title,
"article_id": article_id,
"redirect": is_redirect,
"is_category": is_category,
})
except Exception as e: except Exception as e:
problems.append(f"{path}: {e}") problems.append(f"{path}: {e}")
@ -177,9 +161,57 @@ for i, path in enumerate(files, 1):
if i % 200 == 0: if i % 200 == 0:
print(f"{i}/{len(files)} analysés") print(f"{i}/{len(files)} analysés")
# --------------------------------------------------
# PASS 2 — choose the canonical versions
# --------------------------------------------------
canonical_pages = {}   # article_id -> {"path", "title", "redirect"} of the chosen variant
equivalences = {}      # variant title -> canonical title of the same article
category_replaced = 0  # articles whose chosen variant is still a category page
nb_all_cat = 0         # articles where every collected variant is a category page
def variant_score(v):
    """Sort key for duplicate-page variants.

    Lower tuples are better canonical candidates: prefer real content over
    category pages, non-redirects over redirects, filenames without a
    "category:" marker, and finally the shortest filename for stability.
    """
    filename = v["path"].name
    return (
        v["is_category"],                 # False (0) beats True (1)
        v["redirect"],                    # non-redirect beats redirect
        "category:" in filename.lower(),  # filename-level safety net
        len(filename),                    # deterministic tie-break
    )
for article_id, variants in all_variants.items():
    # Deterministic choice: min with a total-order key picks the same
    # element as sorting and taking the head (first minimum wins ties).
    best = min(variants, key=variant_score)

    # Bookkeeping for the summary printed below.
    if all(variant["is_category"] for variant in variants):
        nb_all_cat += 1
    if best["is_category"]:
        category_replaced += 1

    canonical_pages[article_id] = {
        key: best[key] for key in ("path", "title", "redirect")
    }

    # Every variant title resolves to the chosen canonical title.
    for variant in variants:
        equivalences[variant["title"]] = best["title"]

print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
print(f"{category_replaced} 'category_*' remplacées par leur version de base")
# -------------------------------------------------- # --------------------------------------------------
# PASS 2 — resolve redirects # PASS 3 — resolve redirects
# -------------------------------------------------- # --------------------------------------------------
def resolve_redirect(key): def resolve_redirect(key):
@ -189,24 +221,18 @@ def resolve_redirect(key):
key = redirects[key] key = redirects[key]
return key return key
for k, v in list(redirects.items()): for k, v in list(redirects.items()):
equivalences[k] = resolve_redirect(v) equivalences[k] = resolve_redirect(v)
# -------------------------------------------------- # --------------------------------------------------
# PASS 3 — copy canonical pages # PASS 4 — copie des pages canoniques
# -------------------------------------------------- # --------------------------------------------------
copied = 0 copied = 0
# for key, src in canonical_pages.items():
for key, data in canonical_pages.items(): for key, data in canonical_pages.items():
src = data["path"] src = data["path"]
dst_name = sanitize_filename(src.name) dst_name = sanitize_filename(src.name)
dst = PAGES_DIR / dst_name dst = PAGES_DIR / dst_name
try: try:
shutil.copy2(src, dst) shutil.copy2(src, dst)
canonical_pages[key] = dst_name canonical_pages[key] = dst_name
@ -216,7 +242,6 @@ for key, data in canonical_pages.items():
print(f"{copied} pages copiées") print(f"{copied} pages copiées")
# -------------------------------------------------- # --------------------------------------------------
# SAVE REGISTRY # SAVE REGISTRY
# -------------------------------------------------- # --------------------------------------------------
@ -229,11 +254,9 @@ registry = {
} }
REGISTRY_PATH.parent.mkdir(exist_ok=True) REGISTRY_PATH.parent.mkdir(exist_ok=True)
with open(REGISTRY_PATH, "w", encoding="utf-8") as f: with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
json.dump(registry, f, indent=2, ensure_ascii=False) json.dump(registry, f, indent=2, ensure_ascii=False)
# -------------------------------------------------- # --------------------------------------------------
# REPORT # REPORT
# -------------------------------------------------- # --------------------------------------------------
@ -245,7 +268,6 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write(f"Redirects: {len(redirects)}\n") f.write(f"Redirects: {len(redirects)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n") f.write(f"Problems: {len(problems)}\n\n")
for p in problems[:200]: for p in problems[:200]:
f.write(p + "\n") f.write(p + "\n")