from pathlib import Path
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote

# --------------------------------------------------
# PATHS
# --------------------------------------------------

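# Inputs are assumed to come from earlier pipeline steps: a local dump of wiki
# pages in PAGES_DIR and a title-equivalence registry built from that dump.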
PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------

with REGISTRY_PATH.open(encoding="utf-8") as f:
    registry = json.load(f)

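# Registry layout assumed from how the fields are used below:
#   registry["equivalences"]    : normalized link title -> canonical page title
#   registry["canonical_pages"] : mapping whose values are the canonical page titles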
equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]

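# All canonical page titles; not referenced again in this script.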
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------

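# Normalize a wiki title for lookups: percent-decode, underscores -> spaces,
# collapse whitespace, casefold. Illustrative example (not from the source):
#   normalize_title("Getting%20Started_Guide") -> "getting started guide"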
def normalize_title(title: str | None):
    if not title:
        return None

    title = unquote(title)
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title.strip())
    return title.casefold()


# -------------------------
# Extract MediaWiki target
# -------------------------

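# Pull the raw page title out of an internal MediaWiki href.
# Illustrative examples (not from the source):
#   extract_mediawiki_target("/wiki/Main_Page")             -> "Main_Page"
#   extract_mediawiki_target("/index.php?title=Main_Page")  -> "Main_Page"
#   extract_mediawiki_target("#References")                 -> None  (anchor)
#   extract_mediawiki_target("https://example.org/x")       -> None  (external)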
def extract_mediawiki_target(href: str):
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # external link
    if parsed.scheme in ("http", "https"):
        return None

    path = parsed.path or ""

    # /wiki/Page_Name
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]

    # index.php?title=Page
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback filename-like
    return Path(path).stem


# -------------------------
# Ignore unwanted namespaces
# -------------------------

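# Prefixes are lowercase because titles are casefolded by normalize_title(),
# so "File:", "Template:", etc. are matched case-insensitively.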
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)

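# str.startswith() accepts a tuple, so the single call below checks every prefix.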
def is_ignored_namespace(title_norm: str):
    return title_norm.startswith(IGNORED_PREFIXES)


# -------------------------
# Extract article links
# -------------------------

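# Collect hrefs from the main article body (div#mw-content-text), skipping links
# that sit inside navboxes (navigation/metadata rather than article prose).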
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.select("a[href]"):
        # ignore navboxes / metadata
        if a.find_parent(class_="navbox"):
            continue

        href = a.get("href")
        links.append(href)

    return links


# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

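# Every internal link found in the dump is classified as either resolved
# (its normalized title maps to a canonical page via the registry) or unresolved.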
resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:
        raw_target = extract_mediawiki_target(href)
        norm = normalize_title(raw_target)

        if not norm:
            continue

        if is_ignored_namespace(norm):
            continue

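        # Record where the link was found, its raw href, and the normalized target title.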
        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)

        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} pages scanned")

# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------

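# Two JSON reports: links whose target resolved to a canonical page, and links that did not.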
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))