Fix link mapping: key redirects and equivalences by the full normalized page title, and add a final pass that re-normalizes equivalences and flags non-canonical mappings

This commit is contained in:
maximator 2026-04-04 02:23:05 +02:00
parent a3f3e61e7d
commit 90dd3cc152

View file

@ -116,15 +116,12 @@ for i, path in enumerate(files, 1):
title = html.unescape(title) title = html.unescape(title)
norm = normalize_title(title) norm = normalize_title(title)
page_name = extract_wg_page_name(page_html)
full_title = normalize_title(page_name) if page_name else norm
base_title = norm base_title = norm
is_redirect = bool(IS_REDIRECT_RE.search(page_html)) is_redirect = bool(IS_REDIRECT_RE.search(page_html))
is_category = ns == "Category" or norm.startswith("category:") is_category = ns == "Category" or norm.startswith("category:")
# redirect interne
redir = extract_internal_redirect(page_html)
if redir:
redirects[normalize_title(redir)] = norm
# Categories # Categories
if ns == "Category": if ns == "Category":
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html) m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
@ -133,7 +130,6 @@ for i, path in enumerate(files, 1):
wg_title = html.unescape(m_title.group(1)) wg_title = html.unescape(m_title.group(1))
cat_base = normalize_title(wg_title) cat_base = normalize_title(wg_title)
page_name = extract_wg_page_name(page_html)
page_norm = normalize_title(page_name) if page_name else None page_norm = normalize_title(page_name) if page_name else None
if page_norm and page_norm != f"category:{cat_base}": if page_norm and page_norm != f"category:{cat_base}":
@ -147,9 +143,15 @@ for i, path in enumerate(files, 1):
base_title = norm.replace("category:", "", 1) base_title = norm.replace("category:", "", 1)
is_category = True is_category = True
# redirect interne
redir = extract_internal_redirect(page_html)
if redir:
redirects[full_title] = normalize_title(redir)
all_variants[article_id].append({ all_variants[article_id].append({
"path": path, "path": path,
"title": base_title, "title": base_title,
"full_title": full_title,
"article_id": article_id, "article_id": article_id,
"redirect": is_redirect, "redirect": is_redirect,
"is_category": is_category, "is_category": is_category,
@ -204,7 +206,7 @@ for article_id, variants in all_variants.items():
# équivalences # équivalences
for v in variants: for v in variants:
equivalences[v["title"]] = chosen["title"] equivalences[v["full_title"]] = chosen["title"]
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}") print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
@ -224,8 +226,31 @@ def resolve_redirect(key):
for k, v in list(redirects.items()): for k, v in list(redirects.items()):
equivalences[k] = resolve_redirect(v) equivalences[k] = resolve_redirect(v)
for src, dst in list(redirects.items()):
final = equivalences.get(dst, dst)
equivalences[src] = final
redirects.clear()
# --------------------------------------------------
# PASS 4 — normalisation finale des equivalences
# -------------------------------------------------- # --------------------------------------------------
# PASS 4 — copie des pages canoniques
valid_titles = {
data["title"]
for data in canonical_pages.values()
}
for k, v in list(equivalences.items()):
if v not in valid_titles:
equivalences[k] = equivalences.get(v, v)
# invariant registry
for k, v in equivalences.items():
if v not in valid_titles:
problems.append(f"Non canonical mapping: {k} -> {v}")
# --------------------------------------------------
# PASS 5 — copie des pages canoniques
# -------------------------------------------------- # --------------------------------------------------
copied = 0 copied = 0