fix link mapping
This commit is contained in:
parent
a3f3e61e7d
commit
90dd3cc152
1 changed file with 33 additions and 8 deletions
|
|
@ -116,15 +116,12 @@ for i, path in enumerate(files, 1):
|
||||||
|
|
||||||
title = html.unescape(title)
|
title = html.unescape(title)
|
||||||
norm = normalize_title(title)
|
norm = normalize_title(title)
|
||||||
|
page_name = extract_wg_page_name(page_html)
|
||||||
|
full_title = normalize_title(page_name) if page_name else norm
|
||||||
base_title = norm
|
base_title = norm
|
||||||
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
||||||
is_category = ns == "Category" or norm.startswith("category:")
|
is_category = ns == "Category" or norm.startswith("category:")
|
||||||
|
|
||||||
# redirect interne
|
|
||||||
redir = extract_internal_redirect(page_html)
|
|
||||||
if redir:
|
|
||||||
redirects[normalize_title(redir)] = norm
|
|
||||||
|
|
||||||
# Categories
|
# Categories
|
||||||
if ns == "Category":
|
if ns == "Category":
|
||||||
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
||||||
|
|
@ -133,7 +130,6 @@ for i, path in enumerate(files, 1):
|
||||||
wg_title = html.unescape(m_title.group(1))
|
wg_title = html.unescape(m_title.group(1))
|
||||||
cat_base = normalize_title(wg_title)
|
cat_base = normalize_title(wg_title)
|
||||||
|
|
||||||
page_name = extract_wg_page_name(page_html)
|
|
||||||
page_norm = normalize_title(page_name) if page_name else None
|
page_norm = normalize_title(page_name) if page_name else None
|
||||||
|
|
||||||
if page_norm and page_norm != f"category:{cat_base}":
|
if page_norm and page_norm != f"category:{cat_base}":
|
||||||
|
|
@ -147,9 +143,15 @@ for i, path in enumerate(files, 1):
|
||||||
base_title = norm.replace("category:", "", 1)
|
base_title = norm.replace("category:", "", 1)
|
||||||
is_category = True
|
is_category = True
|
||||||
|
|
||||||
|
# redirect interne
|
||||||
|
redir = extract_internal_redirect(page_html)
|
||||||
|
if redir:
|
||||||
|
redirects[full_title] = normalize_title(redir)
|
||||||
|
|
||||||
all_variants[article_id].append({
|
all_variants[article_id].append({
|
||||||
"path": path,
|
"path": path,
|
||||||
"title": base_title,
|
"title": base_title,
|
||||||
|
"full_title": full_title,
|
||||||
"article_id": article_id,
|
"article_id": article_id,
|
||||||
"redirect": is_redirect,
|
"redirect": is_redirect,
|
||||||
"is_category": is_category,
|
"is_category": is_category,
|
||||||
|
|
@ -204,7 +206,7 @@ for article_id, variants in all_variants.items():
|
||||||
|
|
||||||
# équivalences
|
# équivalences
|
||||||
for v in variants:
|
for v in variants:
|
||||||
equivalences[v["title"]] = chosen["title"]
|
equivalences[v["full_title"]] = chosen["title"]
|
||||||
|
|
||||||
|
|
||||||
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
|
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
|
||||||
|
|
@ -224,8 +226,31 @@ def resolve_redirect(key):
|
||||||
for k, v in list(redirects.items()):
|
for k, v in list(redirects.items()):
|
||||||
equivalences[k] = resolve_redirect(v)
|
equivalences[k] = resolve_redirect(v)
|
||||||
|
|
||||||
|
for src, dst in list(redirects.items()):
|
||||||
|
final = equivalences.get(dst, dst)
|
||||||
|
equivalences[src] = final
|
||||||
|
|
||||||
|
redirects.clear()
|
||||||
|
# --------------------------------------------------
|
||||||
|
# PASS 4 — normalisation finale des equivalences
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 4 — copie des pages canoniques
|
|
||||||
|
valid_titles = {
|
||||||
|
data["title"]
|
||||||
|
for data in canonical_pages.values()
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v in list(equivalences.items()):
|
||||||
|
if v not in valid_titles:
|
||||||
|
equivalences[k] = equivalences.get(v, v)
|
||||||
|
|
||||||
|
# invariant registry
|
||||||
|
for k, v in equivalences.items():
|
||||||
|
if v not in valid_titles:
|
||||||
|
problems.append(f"Non canonical mapping: {k} -> {v}")
|
||||||
|
|
||||||
|
# --------------------------------------------------
|
||||||
|
# PASS 5 — copie des pages canoniques
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
copied = 0
|
copied = 0
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue