# Link scanner: walks dumped MediaWiki HTML pages, extracts in-article
# links, normalizes their targets, and splits them into resolved /
# unresolved sets against the equivalence registry.
from pathlib import Path
import json
import re
import unicodedata
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup

# --------------------------------------------------
# PATHS
# --------------------------------------------------

PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")

# parents=True so the scan also works when ../output itself is missing
# (the original mkdir(exist_ok=True) would raise FileNotFoundError then).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------

# Close the registry file deterministically instead of leaving the bare
# open() handle to the garbage collector.
with REGISTRY_PATH.open(encoding="utf-8") as fh:
    registry = json.load(fh)

# Mapping of normalized link target -> resolved title (used by the scan loop).
equivalences = registry["equivalences"]
# Mapping of canonical page keys -> canonical titles.
canonical_pages = registry["canonical_pages"]

# All canonical titles, as a set for O(1) membership checks.
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------
# Single-pass translation table folding curly quotes to their ASCII forms.
_QUOTE_MAP = str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'})


def normalize_title(title: str) -> str:
    """Normalize a wiki title for comparison.

    Steps: trim, NFKC-normalize, underscores -> spaces, curly quotes ->
    ASCII quotes, collapse runs of whitespace, then casefold.
    """
    cleaned = unicodedata.normalize("NFKC", title.strip())
    cleaned = cleaned.replace("_", " ").translate(_QUOTE_MAP)
    return re.sub(r"\s+", " ", cleaned).casefold()
# -------------------------
# Extract MediaWiki target
# -------------------------

def extract_mediawiki_target(href: str):
    """Return the raw wiki page title referenced by *href*, or None.

    Recognizes the MediaWiki link shapes present in the dump:
      * pretty URLs   -> /wiki/Page_Name
      * script URLs   -> .../index.php?title=Page_Name
      * bare/relative -> fallback: the path's filename stem
    Same-page anchors and external links yield None.
    """
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # External link. BUG FIX: the original only rejected http/https, so
    # protocol-relative //host/... links (empty scheme, netloc set) and
    # other schemes (mailto:, ftp:, ...) fell through to the fallback.
    if parsed.scheme or parsed.netloc:
        return None

    path = parsed.path or ""

    # /wiki/Page_Name — decode %XX escapes so titles match the registry
    # (unquote is imported at the top of the file for exactly this).
    if "/wiki/" in path:
        return unquote(path.split("/wiki/", 1)[1])

    # index.php?title=Page (parse_qs already percent-decodes values)
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback: treat the last path component (minus extension) as a title
    return unquote(Path(path).stem)
# -------------------------
# Ignore unwanted namespaces
# -------------------------

# MediaWiki namespace prefixes (already casefolded) whose links are
# metadata rather than article content.
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)


def is_ignored_namespace(title_norm: str):
    """Return True when the normalized title lives in a skipped namespace."""
    namespace, colon, _rest = title_norm.partition(":")
    return bool(colon) and f"{namespace}:" in IGNORED_PREFIXES
# -------------------------
# Extract article content
# -------------------------

def extract_article_links(soup):
    """Collect href values from the article body of a parsed page.

    Only anchors inside the #mw-content-text container are considered,
    and anchors nested inside a navbox (navigation template) are dropped
    as non-article metadata. Returns a list of href strings.
    """
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    return [
        anchor.get("href")
        for anchor in content.select("a[href]")
        # skip navbox / metadata links
        if not anchor.find_parent(class_="navbox")
    ]
# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser")

for i, file_path in enumerate(files, 1):

    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    for href in extract_article_links(soup):

        raw_target = extract_mediawiki_target(href)

        # BUG FIX: extract_mediawiki_target returns None for anchors and
        # external links; normalize_title(None) would raise AttributeError
        # on the very first such link.
        if not raw_target:
            continue

        norm = normalize_title(raw_target)

        if not norm:
            continue

        if is_ignored_namespace(norm):
            continue

        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)

        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    # lightweight progress report
    if i % 100 == 0:
        print(f"{i}/{len(files)} analysées")
# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------

def _dump_json(path, payload):
    """Write *payload* as UTF-8 JSON to *path*, closing the file handle.

    The original bare open() calls left closing/flushing to the garbage
    collector, risking truncated output on interpreter teardown.
    """
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)


_dump_json(OUTPUT_DIR / "resolved_links.json", resolved_links)
_dump_json(OUTPUT_DIR / "unresolved_links.json", unresolved_links)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))