fix mapping canonical preference

Maxime Réaux 2026-04-07 15:06:30 +02:00
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions


@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote
-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
+OUTPUT_DIR.mkdir(exist_ok=True)
-# ======================
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+valid_targets = set(canonical_pages.values())
+# --------------------------------------------------
 # HELPERS
-# ======================
+# --------------------------------------------------
+def normalize_title(title: str | None):
+    if not title:
+        return None
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
-def normalize_href(href: str):
+def extract_mediawiki_target(href: str):
     if not href:
         return None
-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None
-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+    path = parsed.path or ""
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+    # fallback filename-like
+    return Path(path).stem
-def resolve(name):
-    if name in title_registry:
-        return name
-    if name in alias_registry:
-        return alias_registry[name]
-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
-    return None
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)
+# -------------------------
+# Extract article content
+# -------------------------
 def extract_article_links(soup):
@@ -52,33 +104,26 @@ def extract_article_links(soup):
     links = []
-    for a in content.find_all("a", href=True):
-        href = a.get("href")
+    for a in content.select("a[href]"):
+        href = a["href"]
         # ignore anchors
         if href.startswith("#"):
             continue
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue
         links.append(href)
     return links
-# ======================
-# MAIN
-# ======================
-files = list(INPUT_DIR.glob("*.html"))
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------
+resolved_links = []
+unresolved_links = []
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages à analyser")
 for i, file_path in enumerate(files, 1):
@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):
     for href in links:
-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+        if not norm:
             continue
-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue
         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }
+        resolved = equivalences.get(norm)
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)
     if i % 100 == 0:
         print(f"{i}/{len(files)} analysées")
-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------
-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))