# whu_migration_scripts/scan_internal_links.py
#
# Scans the cleaned wiki pages for internal links and resolves each one
# against the equivalence registry, writing resolved/unresolved reports.
import json
import re
import unicodedata
from pathlib import Path
from urllib.parse import parse_qs, unquote, urlparse

from bs4 import BeautifulSoup

# --------------------------------------------------
# CONFIG
# --------------------------------------------------

PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output_ok/link_scan")

OUTPUT_DIR.mkdir(exist_ok=True)

# Normalized-title prefixes for MediaWiki namespaces we do not migrate.
# NOTE(review): these end with a space, but normalize_title() does not map
# ":" to " " — confirm upstream normalization handles "File:X" style titles.
IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
# Fix: the original passed open(...) straight to json.load, leaking the
# read handle; the context manager guarantees it is closed.
with REGISTRY_PATH.open(encoding="utf-8") as registry_file:
    registry = json.load(registry_file)

equivalences = registry["equivalences"]        # normalized title -> canonical filename
canonical_pages = registry["canonical_pages"]  # page -> canonical .html filename
valid_targets = set(canonical_pages.values())  # all canonical .html filenames
# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str) -> str:
    """Normalize a page title or filename for registry lookups.

    Decodes percent-escapes, strips a trailing extension, applies NFKC,
    turns underscores into spaces, maps typographic quotes to their ASCII
    equivalents, collapses whitespace, and casefolds.

    Falsy input yields "" (the original implicitly returned None; both are
    falsy for callers, so this is backward compatible).
    """
    if not title:
        return ""
    title = unquote(title.strip())
    # NOTE(review): Path(...).stem also truncates titles that contain a
    # literal dot (e.g. "Mr. Smith" -> "Mr") — confirm that is intended.
    title = Path(title).stem
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # Fix: the pasted source had replace("", ...) calls — an empty pattern
    # inserts the replacement between every character.  The curly-quote
    # literals were evidently lost in extraction; restore the intended
    # typographic-quote -> ASCII mapping.
    title = (
        title.replace("\u2019", "'")  # ’ right single quote
        .replace("\u2018", "'")       # ‘ left single quote
        .replace("\u201c", '"')       # “ left double quote
        .replace("\u201d", '"')       # ” right double quote
    )
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
def extract_mediawiki_target(href: str):
    """Extract the raw MediaWiki page target from an internal href.

    Returns the page name for "/wiki/<name>" links, the ``title`` query
    parameter for "index.php?title=..." links, or the path stem for plain
    relative links.  Returns None for empty hrefs, same-page anchors, and
    external links.
    """
    if not href:
        return None
    if href.startswith("#"):
        return None  # same-page anchor
    parsed = urlparse(href)
    # Fix: reject every external/absolute form, not only http(s).  This
    # also filters mailto:, javascript:, ftp:, and protocol-relative
    # "//host/..." hrefs (empty scheme but non-empty netloc).
    if parsed.scheme or parsed.netloc:
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem
def is_ignored_namespace(title_norm: str):
    """True when the normalized title lives in a namespace we skip."""
    return any(title_norm.startswith(prefix) for prefix in IGNORED_PREFIXES)
def extract_article_links(soup):
    """Collect the article-body links of a parsed page.

    Only anchors inside the "mw-content-text" div count, and anchors that
    sit inside a navbox (boilerplate navigation) are skipped.  Each link is
    reported as a dict with "href", "title", and "text" keys.
    """
    content = soup.find("div", id="mw-content-text")
    # bs4 Tag truthiness: an absent (None) or empty content div both mean
    # there is nothing to scan.
    if not content:
        return []
    collected = []
    for anchor in content.select("a[href]"):
        if anchor.find_parent(class_="navbox"):
            continue  # navigation boilerplate, not article content
        collected.append(
            {
                "href": anchor.get("href"),
                "title": anchor.get("title"),
                "text": anchor.get_text(strip=True),
            }
        )
    return collected
def resolve_link(raw_target, title_attr):
    """Resolve a scanned link to a canonical page filename.

    The <a> title attribute is tried first, then the href-derived target.
    Returns (filename, method) where method is "equivalence" or "direct",
    (None, "ignored") for skipped namespaces, and (None, "unresolved")
    when no candidate matches.
    """
    for candidate in (title_attr, raw_target):
        if not candidate:
            continue
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        direct = norm.replace(" ", "_") + ".html"
        if direct in valid_targets:
            return direct, "direct"
    return None, "unresolved"
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []
files = list(PAGES_DIR.glob("*.html"))

print(f"{len(files)} pages à analyser")
for i, file_path in enumerate(files, 1):
    soup = BeautifulSoup(
        file_path.read_text(encoding="utf-8", errors="ignore"),
        "html.parser",
    )
    for link in extract_article_links(soup):
        raw_target = extract_mediawiki_target(link["href"])
        resolved, method = resolve_link(raw_target, link["title"])
        entry = {
            "source": file_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }
        if resolved:
            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)
    # Progress heartbeat every 200 pages.
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysées")
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
# Fix: the original passed bare open(...) handles to json.dump and never
# closed them; the context managers guarantee both files are flushed and
# closed even if serialization fails.
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))