fix mapping canonical preference
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions
@@ -5,6 +5,7 @@ import shutil
 import html
 from pathlib import Path
 from collections import defaultdict
+from difflib import SequenceMatcher
 
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -24,6 +25,11 @@ INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
 IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
 NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
+WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
+SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
+
+def similarity(a, b):
+    return SequenceMatcher(None, a, b).ratio()
 
 def normalize_title(title: str) -> str:
     title = title.strip()
@@ -77,6 +83,25 @@ def extract_namespace(html: str) -> str:
     return ""
 
 
+def extract_wg_title(page_html):
+    m = WG_TITLE_RE.search(page_html)
+    if m:
+        return html.unescape(m.group(1))
+    return None
+
+
+def normalize_reference_key(key: str) -> str:
+    key = normalize_title(key)
+
+    # normalise namespace category
+    key = re.sub(r"^category[\s:_]+", "", key)
+
+    # collapse spaces
+    key = re.sub(r"\s+", " ", key)
+
+    return key.strip()
+
+
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -121,6 +146,7 @@ for i, path in enumerate(files, 1):
     base_title = norm
     is_redirect = bool(IS_REDIRECT_RE.search(page_html))
     is_category = ns == "Category" or norm.startswith("category:")
+    wg_title = extract_wg_title(page_html)
 
     # Categories
     if ns == "Category":
@@ -148,11 +174,13 @@ for i, path in enumerate(files, 1):
     if redir:
         redirects[full_title] = normalize_title(redir)
 
+    canonical_key = normalize_reference_key(full_title)
     all_variants[article_id].append({
         "path": path,
         "title": base_title,
-        "full_title": full_title,
+        "canonical_key": canonical_key,
         "article_id": article_id,
+        "wg_title": normalize_title(wg_title) if wg_title else None,
         "redirect": is_redirect,
         "is_category": is_category,
     })
@@ -162,6 +190,7 @@ for i, path in enumerate(files, 1):
 
     if i % 200 == 0:
         print(f"{i}/{len(files)} analyzed")
+print("Variants collected:", len(all_variants))
 
 # --------------------------------------------------
 # PASS 2 — choosing the canonical versions
@@ -173,15 +202,52 @@ category_replaced = 0
 nb_all_cat = 0
 
 
+def slug_to_title(filename: str) -> str:
+    name = Path(filename).stem
+    name = re.sub(r"\d+$", "", name)
+    return normalize_title(name)
+
+
+def filename_similarity_score(filename, wg_title):
+    if not wg_title:
+        return 0
+
+    filename = normalize_title(filename)
+    wg_title = normalize_title(wg_title)
+
+    # strip trailing digit suffixes
+    filename = re.sub(r"\d+$", "", filename)
+
+    return similarity(filename, wg_title)
+
+
 def variant_score(v):
-    """
-    The lower the score, the better the candidate.
-    """
+    filename = v["path"].stem
+    filename_norm = normalize_title(filename)
+
+    similarity_score = filename_similarity_score(
+        filename_norm,
+        v["wg_title"]
+    )
+
+    is_short_slug = bool(
+        SHORT_SLUG_RE.match(filename_norm.replace(" ", ""))
+    )
+
+    long_title_penalty = (
+        "," in filename or
+        "_" in filename or
+        len(filename) > 40
+    )
+
     return (
-        v["is_category"],  # False (0) beats True (1)
-        v["redirect"],     # False is better
-        "category:" in v["path"].name.lower(),  # filename safety check
-        len(v["path"].name),  # stability
+        v["is_category"],
+        v["redirect"],
+        not is_short_slug,
+        long_title_penalty,
+        -similarity_score,
+        len(filename),
+        filename.lower(),
     )
 
 
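Review note: a minimal sketch of how the new variant_score tuple key picks the canonical variant; the two variants below are hypothetical and assume the script's helpers are in scope. Tuples compare element by element, so non-category, non-redirect, short-slug files with high filename/wgTitle similarity sort first.

    from pathlib import Path

    # Plain short-slug file: not a category, not a redirect, slug matches wgTitle.
    a = {"path": Path("abc123.html"), "wg_title": "abc",
         "is_category": False, "redirect": False}
    # Long comma/underscore filename flagged as a category.
    b = {"path": Path("category_abc,_list_of.html"), "wg_title": "abc",
         "is_category": True, "redirect": False}

    # variant_score(a) -> (False, False, False, False, -1.0, 6, "abc123");
    # variant_score(b) starts with True, so tuple comparison puts it last.
    chosen = sorted([a, b], key=variant_score)[0]   # -> a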
@@ -189,6 +255,7 @@ for article_id, variants in all_variants.items():
 
     # deterministic sort
     variants_sorted = sorted(variants, key=variant_score)
+    print(f"variants_sorted: {variants_sorted}")
 
     chosen = variants_sorted[0]
@@ -198,16 +265,34 @@ for article_id, variants in all_variants.items():
     if chosen["is_category"]:
         category_replaced += 1
 
+    canonical_title = normalize_reference_key(chosen["title"])
+
     canonical_pages[article_id] = {
         "path": chosen["path"],
-        "title": chosen["title"],
+        "title": canonical_title,
         "redirect": chosen["redirect"],
     }
 
     # equivalences
     for v in variants:
-        equivalences[v["full_title"]] = chosen["title"]
+        equivalences[v["canonical_key"]] = chosen["title"]
+
+equivalences.clear()
+
+def add_equivalence(k, v):
+    k = normalize_reference_key(k)
+    v = normalize_reference_key(v)
+
+    if k != v:
+        equivalences[k] = v
+
+for article_id, variants in all_variants.items():
+    canonical_title = canonical_pages[article_id]["title"]
+    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
+    for v in variants:
+        add_equivalence(v["canonical_key"], canonical_slug)
+        filename_key = normalize_title(Path(v["path"]).stem)
+        add_equivalence(filename_key, canonical_slug)
+
 print(f"Number of cases where all variants are categories: {nb_all_cat}")
 print(f"{category_replaced} 'category_*' replaced by their base version")
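Review note: a sketch of what the rebuilt equivalence pass registers for one article (hypothetical titles; assumes the script's helpers):

    # Canonical file "dark_iron_dwarf.html"; a duplicate was saved as "dark_iron_dwarf2.html".
    add_equivalence("Category:Dark Iron Dwarf", "dark_iron_dwarf")  # canonical_key -> slug
    add_equivalence("dark_iron_dwarf2", "dark_iron_dwarf")          # filename stem -> slug
    # normalize_reference_key() runs on both sides, so lookups by the full title,
    # the category-prefixed title, or any duplicate filename land on the same slug.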
@@ -231,7 +316,7 @@ for src, dst in list(redirects.items()):
     equivalences[src] = final
 
 redirects.clear()
 
 # --------------------------------------------------
 # PASS 4 — final normalization of equivalences
 # --------------------------------------------------
@@ -243,21 +328,41 @@ valid_titles = {
 for k, v in list(equivalences.items()):
     if v not in valid_titles:
         equivalences[k] = equivalences.get(v, v)
 
+# category:* or category_* as keys
+for k, v in list(equivalences.items()):
+    new_k = re.sub(r"^category[\s:_]+", "category ", k)
+    if new_k != k:
+        equivalences[new_k] = v
+        del equivalences[k]
+
 # invariant registry
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")
 
+equivalences = {
+    k: v for k, v in equivalences.items()
+    if k != v
+}
+
 # --------------------------------------------------
 # PASS 5 — copying the canonical pages
 # --------------------------------------------------
 
 
+def title_to_filename(title: str) -> str:
+    return sanitize_filename(
+        title.replace(" ", "_").casefold() + ".html"
+    )
+
+
 copied = 0
-for key, data in canonical_pages.items():
+total = len(canonical_pages)
+
+for i, (key, data) in enumerate(canonical_pages.items(), 1):
+
     src = data["path"]
-    dst_name = sanitize_filename(src.name)
+    dst_name = sanitize_filename(src.name.casefold())
     dst = PAGES_DIR / dst_name
 
     try:
         shutil.copy2(src, dst)
         canonical_pages[key] = dst_name
@@ -265,6 +370,9 @@ for key, data in canonical_pages.items():
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
 
+    if i % 200 == 0 or i == total:
+        print(f"{i}/{total} copied")
+
 print(f"{copied} pages copied")
 
 # --------------------------------------------------
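Review note: the scanner below reads this registry back; a sketch of the expected equivalence_registry.json shape, written as a Python literal (field names taken from the reader, values hypothetical; the dump step itself is outside this diff):

    registry = {
        "equivalences":    {"category dark iron dwarf": "dark_iron_dwarf"},
        "canonical_pages": {"10423": "dark_iron_dwarf.html"},
    }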
@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote
 
-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
-
-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
-
-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
-
-# ======================
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------
+
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
+
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
 # HELPERS
-# ======================
+# --------------------------------------------------
+
+def normalize_title(title: str | None):
+    if not title:
+        return None
+
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
 
-def normalize_href(href: str):
+def extract_mediawiki_target(href: str):
     if not href:
         return None
 
-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None
 
-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback filename-like
+    return Path(path).stem
 
-def resolve(name):
-    if name in title_registry:
-        return name
-
-    if name in alias_registry:
-        return alias_registry[name]
-
-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
-
-    return None
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
+
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
+
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)
+
+
+# -------------------------
+# Extract article content
+# -------------------------
 
 def extract_article_links(soup):
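Review note: expected behaviour of the new target extraction, traced from the code above (page names hypothetical):

    # extract_mediawiki_target("/wiki/Dark_Iron_Dwarf")             -> "Dark_Iron_Dwarf"
    # extract_mediawiki_target("/index.php?title=Dark_Iron_Dwarf")  -> "Dark_Iron_Dwarf"
    # extract_mediawiki_target("https://example.org/wiki/X")        -> None  (external)
    # extract_mediawiki_target("#References")                       -> None  (anchor)
    # extract_mediawiki_target("dark_iron_dwarf.html")              -> "dark_iron_dwarf"  (fallback stem)
    # normalize_title("Dark_Iron_Dwarf")                            -> "dark iron dwarf"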
@@ -52,33 +104,26 @@ def extract_article_links(soup):
 
     links = []
 
-    for a in content.find_all("a", href=True):
+    for a in content.select("a[href]"):
 
-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue
 
+        href = a.get("href")
         links.append(href)
 
     return links
 
 
-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------
 
-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages to scan")
 
 for i, file_path in enumerate(files, 1):
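Review note: a self-contained sketch of the new navbox filter (markup hypothetical; in the script, `content` comes from the unchanged top of the function):

    from bs4 import BeautifulSoup

    doc = ('<div id="content">'
           '<p><a href="/wiki/Kept_Link">kept</a></p>'
           '<div class="navbox"><a href="/wiki/Nav_Link">skipped</a></div>'
           '</div>')
    content = BeautifulSoup(doc, "html.parser").find(id="content")

    links = [a.get("href") for a in content.select("a[href]")
             if not a.find_parent(class_="navbox")]
    # -> ["/wiki/Kept_Link"]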
@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):
 
     for href in links:
 
-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+
+        if not norm:
             continue
 
-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue
+
         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }
 
+        resolved = equivalences.get(norm)
+
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)
 
     if i % 100 == 0:
         print(f"{i}/{len(files)} scanned")
 
-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------
 
-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
 
 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))
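Review note: a possible follow-up (not part of this commit) that reads the new output back and lists the most frequent unresolved targets:

    import json
    from collections import Counter
    from pathlib import Path

    path = Path("../output/link_scan") / "unresolved_links.json"
    unresolved = json.load(open(path, encoding="utf-8"))

    for name, count in Counter(e["normalized"] for e in unresolved).most_common(10):
        print(f"{count:5d}  {name}")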