fix link mapping
This commit is contained in:
parent
a3f3e61e7d
commit
90dd3cc152
1 changed file with 33 additions and 8 deletions
|
|
@@ -116,15 +116,12 @@ for i, path in enumerate(files, 1):
|
|||
|
||||
title = html.unescape(title)
|
||||
norm = normalize_title(title)
|
||||
page_name = extract_wg_page_name(page_html)
|
||||
full_title = normalize_title(page_name) if page_name else norm
|
||||
base_title = norm
|
||||
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
||||
is_category = ns == "Category" or norm.startswith("category:")
|
||||
|
||||
# redirect interne
|
||||
redir = extract_internal_redirect(page_html)
|
||||
if redir:
|
||||
redirects[normalize_title(redir)] = norm
|
||||
|
||||
# Categories
|
||||
if ns == "Category":
|
||||
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
||||
|
|
@@ -133,7 +130,6 @@ for i, path in enumerate(files, 1):
|
|||
wg_title = html.unescape(m_title.group(1))
|
||||
cat_base = normalize_title(wg_title)
|
||||
|
||||
page_name = extract_wg_page_name(page_html)
|
||||
page_norm = normalize_title(page_name) if page_name else None
|
||||
|
||||
if page_norm and page_norm != f"category:{cat_base}":
|
||||
|
|
@@ -147,9 +143,15 @@ for i, path in enumerate(files, 1):
|
|||
base_title = norm.replace("category:", "", 1)
|
||||
is_category = True
|
||||
|
||||
# redirect interne
|
||||
redir = extract_internal_redirect(page_html)
|
||||
if redir:
|
||||
redirects[full_title] = normalize_title(redir)
|
||||
|
||||
all_variants[article_id].append({
|
||||
"path": path,
|
||||
"title": base_title,
|
||||
"full_title": full_title,
|
||||
"article_id": article_id,
|
||||
"redirect": is_redirect,
|
||||
"is_category": is_category,
|
||||
|
|
@@ -204,7 +206,7 @@ for article_id, variants in all_variants.items():
|
|||
|
||||
# équivalences
|
||||
for v in variants:
|
||||
equivalences[v["title"]] = chosen["title"]
|
||||
equivalences[v["full_title"]] = chosen["title"]
|
||||
|
||||
|
||||
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
|
||||
|
|
@@ -224,8 +226,31 @@ def resolve_redirect(key):
|
|||
for k, v in list(redirects.items()):
|
||||
equivalences[k] = resolve_redirect(v)
|
||||
|
||||
for src, dst in list(redirects.items()):
|
||||
final = equivalences.get(dst, dst)
|
||||
equivalences[src] = final
|
||||
|
||||
redirects.clear()
|
||||
# --------------------------------------------------
|
||||
# PASS 4 — normalisation finale des equivalences
|
||||
# --------------------------------------------------
|
||||
# PASS 4 — copie des pages canoniques
|
||||
|
||||
valid_titles = {
|
||||
data["title"]
|
||||
for data in canonical_pages.values()
|
||||
}
|
||||
|
||||
for k, v in list(equivalences.items()):
|
||||
if v not in valid_titles:
|
||||
equivalences[k] = equivalences.get(v, v)
|
||||
|
||||
# invariant registry
|
||||
for k, v in equivalences.items():
|
||||
if v not in valid_titles:
|
||||
problems.append(f"Non canonical mapping: {k} -> {v}")
|
||||
|
||||
# --------------------------------------------------
|
||||
# PASS 5 — copie des pages canoniques
|
||||
# --------------------------------------------------
|
||||
|
||||
copied = 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue