fix bad redirects in equivalences
This commit is contained in:
parent
556d6f1e03
commit
022a17221d
1 changed files with 99 additions and 36 deletions
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
SOURCE_DIR = Path("../original_index")
|
SOURCE_DIR = Path("../original_index")
|
||||||
OUTPUT_DIR = Path("../output")
|
OUTPUT_DIR = Path("../output")
|
||||||
|
|
@ -29,12 +30,32 @@ NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
|
||||||
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
|
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
|
||||||
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
|
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
|
||||||
|
|
||||||
|
# Matches one literal JSON-style escape of the form \uXXXX (four hex digits).
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')

# Matches a high-surrogate escape immediately followed by a low-surrogate
# escape, i.e. the UTF-16 spelling of one code point above U+FFFF.
_SURROGATE_PAIR_RE = re.compile(
    r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
)


def _join_surrogate_pair(m):
    """Turn a matched high/low surrogate escape pair into one character."""
    high = int(m.group(1), 16)
    low = int(m.group(2), 16)
    return chr(0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))


def _decode_unicode_escape(m):
    """Turn a single matched \\uXXXX escape into its character."""
    code = int(m.group(1), 16)
    # An unpaired surrogate is not a valid character and cannot be encoded
    # as UTF-8; substitute the replacement character instead of emitting it.
    if 0xD800 <= code <= 0xDFFF:
        return "\ufffd"
    return chr(code)


def decode_mediawiki_string(s: str) -> str:
    """Decode HTML entities and literal \\uXXXX escapes found in *s*.

    Empty or falsy input is returned unchanged. HTML entities are unescaped
    first, then \\uXXXX sequences are decoded. Escaped surrogate pairs are
    combined into the single character they represent; an unpaired surrogate
    escape becomes U+FFFD.

    No other backslash escape is interpreted.
    NOTE(review): an escaped backslash followed by ``u`` + 4 hex digits is
    still decoded as a Unicode escape — confirm whether inputs can contain it.
    """
    if not s:
        return s

    # 1 - HTML entities
    s = html.unescape(s)

    # 2 - surrogate pairs first, so they are never split into two lone halves
    s = _SURROGATE_PAIR_RE.sub(_join_surrogate_pair, s)

    # 3 - remaining single \uXXXX sequences
    return UNICODE_ESCAPE_RE.sub(_decode_unicode_escape, s)
|
||||||
|
|
||||||
|
|
||||||
def similarity(a, b) -> float:
    """Return the difflib similarity ratio of *a* and *b*.

    The value comes from ``SequenceMatcher(None, a, b).ratio()``: 1.0 for
    identical sequences, 0.0 when nothing matches. No junk filter is applied.
    """
    return SequenceMatcher(None, a, b).ratio()
|
||||||
|
|
||||||
# Typographic quotes mapped to their ASCII equivalents in a single pass.
_TITLE_QUOTE_TABLE = str.maketrans({
    "’": "'",
    "‘": "'",
    "“": '"',
    "”": '"',
})


def normalize_title(title: str) -> str:
    """Return a case-insensitive comparison key for a page title.

    Steps, in order: Unicode NFKC normalisation, underscores to spaces,
    typographic quotes to ASCII quotes, whitespace runs collapsed to one
    space, surrounding whitespace removed, then ``casefold()``.

    Trimming happens after the underscore replacement, so leading or
    trailing underscores do not leave edge spaces in the key.
    """
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ").translate(_TITLE_QUOTE_TABLE)
    # Collapse first, trim last: anything that became whitespace above
    # (underscores, compatibility spaces) is handled by the same pass.
    title = re.sub(r"\s+", " ", title).strip()
    return title.casefold()
|
||||||
|
|
||||||
|
|
@ -47,7 +68,7 @@ def sanitize_filename(name: str) -> str:
|
||||||
# Compiled once at import time, like the other wg* patterns in this file.
_WG_PAGE_NAME_RE = re.compile(r'"wgPageName"\s*:\s*"([^"]+)"')


def extract_wg_page_name(page_html: str) -> str | None:
    """Return the decoded ``wgPageName`` value found in *page_html*.

    The first ``"wgPageName": "..."`` occurrence is used. Its value is passed
    through ``decode_mediawiki_string`` and underscores are replaced by
    spaces. Returns ``None`` when the key is not present.

    NOTE(review): the capture stops at the first double quote, so a value
    containing an escaped quote would be truncated — confirm whether such
    page names occur in the input.
    """
    m = _WG_PAGE_NAME_RE.search(page_html)
    if m is None:
        return None
    return decode_mediawiki_string(m.group(1)).replace("_", " ")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -57,7 +78,7 @@ def extract_page_identity(page_html: str):
|
||||||
return page
|
return page
|
||||||
m = re.search(r"<title>(.*?) -", page_html, re.I)
|
m = re.search(r"<title>(.*?) -", page_html, re.I)
|
||||||
if m:
|
if m:
|
||||||
return html.unescape(m.group(1))
|
return decode_mediawiki_string(m.group(1))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -73,7 +94,7 @@ def extract_article_id(page_html: str) -> int | None:
|
||||||
# Compiled once at import time, like the other wg* patterns in this file.
_WG_REDIRECTED_FROM_RE = re.compile(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"')


def extract_internal_redirect(page_html: str) -> str | None:
    """Return the decoded ``wgRedirectedFrom`` value found in *page_html*.

    The first ``"wgRedirectedFrom": "..."`` occurrence is used. Its value is
    passed through ``decode_mediawiki_string`` and underscores are replaced
    by spaces. Returns ``None`` when the key is not present.
    """
    m = _WG_REDIRECTED_FROM_RE.search(page_html)
    if m is None:
        return None
    return decode_mediawiki_string(m.group(1)).replace("_", " ")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -87,7 +108,7 @@ def extract_namespace(page_html: str) -> str:
|
||||||
def extract_wg_title(page_html: str) -> str | None:
    """Return the decoded ``wgTitle`` value found in *page_html*.

    The first match of ``WG_TITLE_RE`` is used and its captured value is
    passed through ``decode_mediawiki_string``. Underscores are left as they
    are. Returns ``None`` when the pattern does not match.
    """
    m = WG_TITLE_RE.search(page_html)
    if m is None:
        return None
    return decode_mediawiki_string(m.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -97,14 +118,16 @@ def normalize_reference_key(key: str) -> str:
|
||||||
# normalise namespace category
|
# normalise namespace category
|
||||||
key = re.sub(r"^category[\s:_]+", "", key)
|
key = re.sub(r"^category[\s:_]+", "", key)
|
||||||
|
|
||||||
|
# normalise les apostrophes typographiques → ascii
|
||||||
|
key = key.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
|
||||||
|
|
||||||
|
|
||||||
# collapse espaces
|
# collapse espaces
|
||||||
key = re.sub(r"\s+", " ", key)
|
key = re.sub(r"\s+", " ", key)
|
||||||
|
|
||||||
return key.strip()
|
return key.strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def has_editorial_content(html_page: str) -> bool:
|
def has_editorial_content(html_page: str) -> bool:
|
||||||
soup = BeautifulSoup(html_page, "html.parser")
|
soup = BeautifulSoup(html_page, "html.parser")
|
||||||
|
|
||||||
|
|
@ -129,6 +152,7 @@ def has_editorial_content(html_page: str) -> bool:
|
||||||
return len(editorial_text) > 200
|
return len(editorial_text) > 200
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# Registry structures
|
# Registry structures
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -145,9 +169,11 @@ print(f"{len(files)} fichiers trouvés")
|
||||||
# PASS 1 — analyse et collecte des variantes
|
# PASS 1 — analyse et collecte des variantes
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
|
category_redirects = {}
|
||||||
|
|
||||||
for i, path in enumerate(files, 1):
|
for i, path in enumerate(files, 1):
|
||||||
try:
|
try:
|
||||||
page_html = path.read_text(encoding="utf-8", errors="ignore")
|
page_html = path.read_text(encoding="utf-8", errors="replace")
|
||||||
|
|
||||||
article_id = extract_article_id(page_html)
|
article_id = extract_article_id(page_html)
|
||||||
if not article_id:
|
if not article_id:
|
||||||
|
|
@ -166,7 +192,7 @@ for i, path in enumerate(files, 1):
|
||||||
ignored_pages.append(path.name)
|
ignored_pages.append(path.name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = html.unescape(title)
|
title = decode_mediawiki_string(title)
|
||||||
norm = normalize_title(title)
|
norm = normalize_title(title)
|
||||||
page_name = extract_wg_page_name(page_html)
|
page_name = extract_wg_page_name(page_html)
|
||||||
full_title = normalize_title(page_name) if page_name else norm
|
full_title = normalize_title(page_name) if page_name else norm
|
||||||
|
|
@ -182,7 +208,7 @@ for i, path in enumerate(files, 1):
|
||||||
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
||||||
|
|
||||||
if m_title:
|
if m_title:
|
||||||
wg_title = html.unescape(m_title.group(1))
|
wg_title = decode_mediawiki_string(m_title.group(1))
|
||||||
cat_base = normalize_title(wg_title)
|
cat_base = normalize_title(wg_title)
|
||||||
|
|
||||||
page_norm = normalize_title(page_name) if page_name else None
|
page_norm = normalize_title(page_name) if page_name else None
|
||||||
|
|
@ -201,7 +227,12 @@ for i, path in enumerate(files, 1):
|
||||||
# redirect interne
|
# redirect interne
|
||||||
redir = extract_internal_redirect(page_html)
|
redir = extract_internal_redirect(page_html)
|
||||||
if redir:
|
if redir:
|
||||||
redirects[full_title] = normalize_title(redir)
|
key = full_title
|
||||||
|
target = normalize_title(redir)
|
||||||
|
if is_listing_only or is_category:
|
||||||
|
category_redirects[key] = target
|
||||||
|
else:
|
||||||
|
redirects[key] = target
|
||||||
|
|
||||||
canonical_key = normalize_reference_key(full_title)
|
canonical_key = normalize_reference_key(full_title)
|
||||||
all_variants[article_id].append({
|
all_variants[article_id].append({
|
||||||
|
|
@ -221,6 +252,7 @@ for i, path in enumerate(files, 1):
|
||||||
if i % 200 == 0:
|
if i % 200 == 0:
|
||||||
print(f"{i}/{len(files)} analysés")
|
print(f"{i}/{len(files)} analysés")
|
||||||
print("Variants collected:", len(all_variants))
|
print("Variants collected:", len(all_variants))
|
||||||
|
print("Added category_redirect from category/listing:", len(category_redirects))
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 2 — choix des versions canoniques
|
# PASS 2 — choix des versions canoniques
|
||||||
|
|
@ -284,8 +316,9 @@ def variant_score(v):
|
||||||
def add_equivalence(k, v):
    """Record ``k -> v`` in ``equivalences`` after normalising both keys.

    Both arguments go through ``normalize_reference_key``. Nothing is stored
    when the normalised key and value are equal. When the value is not the
    title of any entry in ``canonical_pages``, a warning is printed, but the
    mapping is still recorded.
    """
    k = normalize_reference_key(k)
    v = normalize_reference_key(v)

    if k == v:
        return

    # Short-circuiting scan: no throwaway list of every title per call.
    if not any(d["title"] == v for d in canonical_pages.values()):
        print("⚠️ Adding equivalence to NON-CANONICAL value:", k, "->", v)

    equivalences[k] = v
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -310,9 +343,6 @@ for article_id, variants in all_variants.items():
|
||||||
"redirect": chosen["redirect"],
|
"redirect": chosen["redirect"],
|
||||||
}
|
}
|
||||||
|
|
||||||
if chosen["wg_title"]:
|
|
||||||
add_equivalence(chosen["wg_title"], canonical_slug)
|
|
||||||
|
|
||||||
for v in variants:
|
for v in variants:
|
||||||
if v["is_category"] and not v["is_listing_only"]:
|
if v["is_category"] and not v["is_listing_only"]:
|
||||||
# catégorie non choisie
|
# catégorie non choisie
|
||||||
|
|
@ -342,14 +372,59 @@ def resolve_redirect(key):
|
||||||
key = redirects[key]
|
key = redirects[key]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_all(key):
    """Follow ``redirects`` and ``equivalences`` from *key* to a final key.

    At each step a mapping in ``redirects`` takes precedence over one in
    ``equivalences``. The walk stops at the first key that has no mapping in
    either table, or as soon as a key is reached a second time, which makes
    cycles terminate. The key reached at that point is returned.
    """
    seen = set()

    while key not in seen:
        seen.add(key)

        if key in redirects:
            key = redirects[key]
        elif key in equivalences:
            key = equivalences[key]
        else:
            # No further mapping: this is the end of the chain.
            break

    return key
|
||||||
|
|
||||||
|
|
||||||
|
skipped_redirect = 0
|
||||||
|
ignored_redirect = 0
|
||||||
|
|
||||||
|
valid_titles = {
|
||||||
|
data["title"]
|
||||||
|
for data in canonical_pages.values()
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v in category_redirects.items():
|
||||||
|
if k == v:
|
||||||
|
continue
|
||||||
|
final = resolve_all(v)
|
||||||
|
if final in valid_titles and k != final:
|
||||||
|
equivalences[k] = final
|
||||||
|
|
||||||
for k, v in list(redirects.items()):
|
for k, v in list(redirects.items()):
|
||||||
equivalences[k] = resolve_redirect(v)
|
if k == v:
|
||||||
|
continue
|
||||||
|
final = resolve_all(v)
|
||||||
|
if final in valid_titles and k != final:
|
||||||
|
equivalences[k] = final
|
||||||
|
else:
|
||||||
|
skipped_redirect += 1
|
||||||
|
|
||||||
for src, dst in list(redirects.items()):
|
for src, dst in list(redirects.items()):
|
||||||
final = equivalences.get(dst, dst)
|
final = equivalences.get(dst, dst)
|
||||||
|
if final in valid_titles and src != final:
|
||||||
equivalences[src] = final
|
equivalences[src] = final
|
||||||
|
else:
|
||||||
|
ignored_redirect += 1
|
||||||
|
|
||||||
|
print(f"Skipped redirect to non-canonical: {skipped_redirect}")
|
||||||
|
print(f"Ignored redirect (non-canonical): {ignored_redirect}")
|
||||||
|
|
||||||
redirects.clear()
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 4 — normalisation finale des equivalences
|
# PASS 4 — normalisation finale des equivalences
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -361,24 +436,13 @@ def resolve_equivalence(key):
|
||||||
key = equivalences[key]
|
key = equivalences[key]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
for k in list(equivalences):
|
for k in list(equivalences):
|
||||||
equivalences[k] = resolve_equivalence(equivalences[k])
|
final = resolve_equivalence(equivalences[k])
|
||||||
|
if final in valid_titles:
|
||||||
|
equivalences[k] = final
|
||||||
|
|
||||||
valid_titles = {
|
|
||||||
data["title"]
|
|
||||||
for data in canonical_pages.values()
|
|
||||||
}
|
|
||||||
|
|
||||||
for k, v in list(equivalences.items()):
|
|
||||||
if v not in valid_titles:
|
|
||||||
equivalences[k] = equivalences.get(v, v)
|
|
||||||
# category:* ou category_* comme clés
|
|
||||||
for k, v in list(equivalences.items()):
|
|
||||||
new_k = re.sub(r"^category[\s:_]+", "category ", k)
|
|
||||||
if new_k != k:
|
|
||||||
equivalences[new_k] = v
|
|
||||||
del equivalences[k]
|
|
||||||
# invariant registry
|
|
||||||
for k, v in equivalences.items():
|
for k, v in equivalences.items():
|
||||||
if v not in valid_titles:
|
if v not in valid_titles:
|
||||||
problems.append(f"Non canonical mapping: {k} -> {v}")
|
problems.append(f"Non canonical mapping: {k} -> {v}")
|
||||||
|
|
@ -387,17 +451,16 @@ equivalences = {
|
||||||
k: v for k, v in equivalences.items()
|
k: v for k, v in equivalences.items()
|
||||||
if k != v
|
if k != v
|
||||||
}
|
}
|
||||||
|
print(f"Equivalences kept: {len(equivalences)}")
|
||||||
|
|
||||||
|
|
||||||
for k in list(equivalences):
|
|
||||||
equivalences[k] = resolve_equivalence(equivalences[k])
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 5 — copie des pages canoniques
|
# PASS 5 — copie des pages canoniques
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
# Spaces become underscores and typographic quotes become ASCII quotes,
# all in one translate() pass.
_FILENAME_CHAR_TABLE = str.maketrans({
    " ": "_",
    "’": "'",
    "‘": "'",
    "“": '"',
    "”": '"',
})


def title_to_filename(title: str) -> str:
    """Build the output ``.html`` file name for *title*.

    Spaces are replaced by underscores and typographic quotes by ASCII
    quotes, the result is case-folded, ``.html`` is appended, and the whole
    string is passed through ``sanitize_filename``.
    """
    return sanitize_filename(
        title.translate(_FILENAME_CHAR_TABLE).casefold() + ".html"
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue