From 90dd3cc152b0cb1eae059c446de4d07a7422abfd Mon Sep 17 00:00:00 2001 From: maximator Date: Sat, 4 Apr 2026 02:23:05 +0200 Subject: [PATCH] fix link mapping --- prepare_pages_and_registry.py | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index b5784fb..8a619ba 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -116,15 +116,12 @@ for i, path in enumerate(files, 1): title = html.unescape(title) norm = normalize_title(title) + page_name = extract_wg_page_name(page_html) + full_title = normalize_title(page_name) if page_name else norm base_title = norm is_redirect = bool(IS_REDIRECT_RE.search(page_html)) is_category = ns == "Category" or norm.startswith("category:") - # redirect interne - redir = extract_internal_redirect(page_html) - if redir: - redirects[normalize_title(redir)] = norm - # Categories if ns == "Category": m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html) @@ -133,7 +130,6 @@ for i, path in enumerate(files, 1): wg_title = html.unescape(m_title.group(1)) cat_base = normalize_title(wg_title) - page_name = extract_wg_page_name(page_html) page_norm = normalize_title(page_name) if page_name else None if page_norm and page_norm != f"category:{cat_base}": @@ -147,9 +143,15 @@ for i, path in enumerate(files, 1): base_title = norm.replace("category:", "", 1) is_category = True + # redirect interne + redir = extract_internal_redirect(page_html) + if redir: + redirects[full_title] = normalize_title(redir) + all_variants[article_id].append({ "path": path, "title": base_title, + "full_title": full_title, "article_id": article_id, "redirect": is_redirect, "is_category": is_category, @@ -204,7 +206,7 @@ for article_id, variants in all_variants.items(): # équivalences for v in variants: - equivalences[v["title"]] = chosen["title"] + equivalences[v["full_title"]] = chosen["title"] print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}") @@ -224,8 +226,31 @@ def resolve_redirect(key): for k, v in list(redirects.items()): equivalences[k] = resolve_redirect(v) +for src, dst in list(redirects.items()): + final = equivalences.get(dst, dst) + equivalences[src] = final + +redirects.clear() + # -------------------------------------------------- +# PASS 4 — normalisation finale des equivalences # -------------------------------------------------- -# PASS 4 — copie des pages canoniques + +valid_titles = { + data["title"] + for data in canonical_pages.values() +} + +for k, v in list(equivalences.items()): + if v not in valid_titles: + equivalences[k] = equivalences.get(v, v) + +# invariant registry +for k, v in equivalences.items(): + if v not in valid_titles: + problems.append(f"Non canonical mapping: {k} -> {v}") + +# -------------------------------------------------- +# PASS 5 — copie des pages canoniques # -------------------------------------------------- copied = 0