2026-04-03 15:50:40 +02:00
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
import json
|
2026-04-07 15:06:30 +02:00
|
|
|
|
import re
|
2026-04-03 15:50:40 +02:00
|
|
|
|
from bs4 import BeautifulSoup
|
2026-04-07 15:06:30 +02:00
|
|
|
|
from urllib.parse import urlparse, parse_qs, unquote
|
2026-04-15 10:36:21 +02:00
|
|
|
|
import unicodedata
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
2026-04-16 10:04:58 +02:00
|
|
|
|
# CONFIG
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
2026-04-16 10:04:58 +02:00
|
|
|
|
# Input: cleaned HTML pages produced by the previous pipeline step.
PAGES_DIR = Path("../output_ok/cleaned_pages")

# Registry mapping title variants onto canonical page filenames.
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")

# Where the scan reports are written.
OUTPUT_DIR = Path("../output_ok/link_scan")

# Fix: parents=True so a missing ../output_ok does not abort the run with
# FileNotFoundError (exist_ok=True already tolerates re-runs).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Normalized-title prefixes of MediaWiki namespaces that are never resolved.
# These are matched against normalize_title() output, where "File:" /
# "File_" have become "file " — hence the trailing space on each prefix.
IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
)
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
# LOAD REGISTRY
|
|
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Fix: the original passed an unclosed open() handle to json.load;
# Path.read_text opens and closes the file deterministically.
registry = json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))

# variant normalized title -> canonical page filename
equivalences = registry["equivalences"]
# source page title -> canonical page filename
canonical_pages = registry["canonical_pages"]

# The set of filenames a link may legitimately resolve to.
valid_targets = set(canonical_pages.values())
|
|
|
|
|
|
|
|
|
|
|
|
# --------------------------------------------------
|
2026-04-03 15:50:40 +02:00
|
|
|
|
# HELPERS
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
|
2026-04-15 10:36:21 +02:00
|
|
|
|
def normalize_title(title: str) -> str:
    """Normalize a page title or link target into a canonical lookup key.

    Applies, in order: strip, percent-decoding, extension removal,
    Unicode NFKC normalization, underscore-to-space, typographic-quote
    straightening, whitespace collapsing, and casefolding.

    Returns "" for an empty/None input (callers test truthiness).
    """
    if not title:
        # Fix: the original fell off a bare `return` (i.e. None), which
        # contradicted the declared -> str; "" is equally falsy to callers.
        return ""
    title = title.strip()
    title = unquote(title)
    # Drop a trailing file extension such as ".html".
    # NOTE(review): Path.stem strips after the *last* dot, so a genuine
    # title like "Web 2.0" would lose ".0" — confirm inputs are slugs.
    title = Path(title).stem
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    # Straighten curly quotes so typographic variants compare equal.
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
|
|
|
|
|
|
|
|
|
|
|
|
def extract_mediawiki_target(href: str):
    """Extract the raw page target from a MediaWiki-style href.

    Handles "/wiki/<Title>" paths, "index.php?title=<Title>" URLs, and
    bare relative filenames.  Returns the target string, or None when
    the link is empty, a pure in-page fragment, or points off-wiki.
    """
    if not href:
        return None
    if href.startswith("#"):
        # In-page anchor only — no target page.
        return None
    parsed = urlparse(href)
    # Fix: reject *any* absolute/external URL, not just http(s).  The
    # original let protocol-relative links ("//host/wiki/X") and other
    # schemes (mailto:, ftp:) fall through as if they were internal.
    if parsed.scheme or parsed.netloc:
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    # Bare relative link such as "Some_Page.html".
    return Path(path).stem
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
def is_ignored_namespace(title_norm: str):
    """Return True when the normalized title belongs to a namespace we
    never resolve (file, image, category, template, ...)."""
    for prefix in IGNORED_PREFIXES:
        if title_norm.startswith(prefix):
            return True
    return False
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
|
|
|
|
|
def extract_article_links(soup):
    """Collect candidate article links from the page's main content area.

    Links sitting inside a navbox are skipped.  Each result is a dict
    carrying the raw href, the title attribute, and the anchor text.
    """
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    collected = []
    for anchor in content.select("a[href]"):
        # Navboxes repeat the same links on many pages — ignore them.
        if anchor.find_parent(class_="navbox"):
            continue
        collected.append(
            {
                "href": anchor.get("href"),
                "title": anchor.get("title"),
                "text": anchor.get_text(strip=True),
            }
        )
    return collected
|
|
|
|
|
|
|
2026-04-16 10:04:58 +02:00
|
|
|
|
def resolve_link(raw_target, title_attr):
    """Map a link onto a canonical page filename.

    The title attribute is tried before the href-derived target.
    Returns (filename, method) on success, (None, "ignored") for a
    skipped namespace, or (None, "unresolved") when nothing matches.
    """
    for candidate in (title_attr, raw_target):
        if not candidate:
            continue
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        # Fall back to a direct filename match against the known pages.
        as_filename = norm.replace(" ", "_") + ".html"
        if as_filename in valid_targets:
            return as_filename, "direct"
    return None, "unresolved"
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
# MAIN SCAN
|
|
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Accumulators for the scan report.
resolved_links, unresolved_links = [], []

# Every cleaned page to be scanned for outgoing links.
files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser")
|
|
|
|
|
|
|
|
|
|
|
|
for page_index, page_path in enumerate(files, start=1):
    markup = page_path.read_text(encoding="utf-8", errors="ignore")
    page_soup = BeautifulSoup(markup, "html.parser")
    for link in extract_article_links(page_soup):
        target = extract_mediawiki_target(link["href"])
        destination, method = resolve_link(target, link["title"])
        record = {
            "source": page_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }
        if destination:
            record["resolved"] = destination
            resolved_links.append(record)
        else:
            # Keep the raw target to ease debugging of unresolved links.
            record["raw_target"] = target
            unresolved_links.append(record)
    # Lightweight progress indicator every 200 pages.
    if page_index % 200 == 0:
        print(f"{page_index}/{len(files)} analysées")
|
|
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
# SAVE RESULTS
|
|
|
|
|
|
# --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
# Fix: write through a context manager so the JSON file is flushed and
# closed deterministically (the original leaked the open() handle).
with open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8") as out:
    json.dump(resolved_links, out, indent=2, ensure_ascii=False)
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
2026-04-07 15:06:30 +02:00
|
|
|
|
# Fix: write through a context manager so the JSON file is flushed and
# closed deterministically (the original leaked the open() handle).
with open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8") as out:
    json.dump(unresolved_links, out, indent=2, ensure_ascii=False)
|
2026-04-03 15:50:40 +02:00
|
|
|
|
|
|
|
|
|
|
# Final summary — counts of links that did and did not resolve.
print("\n✅ LINK SCAN COMPLETE")
for label, bucket in (("Resolved", resolved_links), ("Unresolved", unresolved_links)):
    print(label + ":", len(bucket))
|