# whu_migration_scripts/scan_internal_links.py
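"""Scan exported wiki pages for internal links and resolve them against
the link registries.

Reads every *.html page in INPUT_DIR, extracts article links from the
MediaWiki content area, and writes two JSON reports to REGISTRY_DIR:
resolved_links.json and unresolved_links.json.
"""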
import json
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

# title_registry keys are canonical page names; alias_registry maps an
# alias to its canonical name (see resolve() below).
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []
# ======================
# HELPERS
# ======================
def normalize_href(href: str):
    if not href:
        return None
    # ignore external links
    if href.startswith("http"):
        return None
    # drop any query string or fragment, then keep the file stem
    path = urlparse(href).path
    name = Path(path).stem
    return name.lower()
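# Hypothetical example: normalize_href("Some_Page.html#History") -> "some_page".
# External links and empty hrefs normalize to None.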
def resolve(name):
    if name in title_registry:
        return name
    if name in alias_registry:
        return alias_registry[name]
    # try removing the category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt
    return None
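# Resolution order: exact title match, then the alias table, then a retry
# with the "category_" prefix stripped. Hypothetical example:
# resolve("category_weapons") -> "weapons" if "weapons" is a registered title.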
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.find_all("a", href=True):
        href = a["href"]
        # ignore in-page anchors
        if href.startswith("#"):
            continue
        # ignore files/images/special pages/actions
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue
        links.append(href)
    return links
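# "mw-content-text" is the standard MediaWiki article-body container, so
# navigation, sidebar, and footer links never reach the resolver.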
# ======================
# MAIN
# ======================
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for href in links:
        key = normalize_href(href)
        if not key:
            continue
        resolved = resolve(key)
        entry = {
            "source": file_path.name,
            "link": href,
        }
        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)
    if i % 100 == 0:
        print(f"{i}/{len(files)} scanned")
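# Each report entry looks like (values hypothetical):
#   {"source": "Some_Page.html", "link": "Other_Page.html", "target": "other_page"}
# Unresolved entries carry "source" and "link" only.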
# ======================
# SAVE
# ======================
with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)
print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))
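
# Assumed usage (run from whu_migration_scripts/, since paths are relative):
#   python scan_internal_links.py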