"""Scan cleaned wiki pages and resolve their internal links against the
equivalence registry, writing resolved and unresolved links to JSON files."""

from pathlib import Path
import json
import re
import unicodedata
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup

# --------------------------------------------------
# CONFIG
# --------------------------------------------------
PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output_ok/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True)

# Links pointing into these MediaWiki namespaces are skipped entirely.
IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
with REGISTRY_PATH.open(encoding="utf-8") as f:
    registry = json.load(f)

equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str) -> str:
    """Normalize a page title or link target for comparison:
    URL-decode, strip extension, unify quotes/underscores/whitespace, casefold."""
    if not title:
        return ""
    title = title.strip()
    title = unquote(title)
    title = Path(title).stem
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()


def extract_mediawiki_target(href: str):
    """Extract the wiki page target from an href, or None for anchors,
    external URLs, and empty links."""
    if not href:
        return None
    if href.startswith("#"):
        return None
    parsed = urlparse(href)
    if parsed.scheme in ("http", "https"):
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem


def is_ignored_namespace(title_norm: str) -> bool:
    return title_norm.startswith(IGNORED_PREFIXES)


def extract_article_links(soup):
    """Collect links from the article body, ignoring navbox links."""
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.select("a[href]"):
        if a.find_parent(class_="navbox"):
            continue
        links.append({
            "href": a.get("href"),
            "title": a.get("title"),
            "text": a.get_text(strip=True),
        })
    return links


def resolve_link(raw_target, title_attr):
    """Resolve a link to a canonical page filename.

    Returns (resolved_filename, method) where method is one of
    "equivalence", "direct", "ignored", or "unresolved".
    """
    candidates = []
    if title_attr:
        candidates.append(title_attr)
    if raw_target:
        candidates.append(raw_target)

    for candidate in candidates:
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        filename = norm.replace(" ", "_") + ".html"
        if filename in valid_targets:
            return filename, "direct"

    return None, "unresolved"

# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to scan")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)

    for link in links:
        raw_target = extract_mediawiki_target(link["href"])
        resolved, method = resolve_link(raw_target, link["title"])

        entry = {
            "source": file_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }

        if resolved:
            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)

    if i % 200 == 0:
        print(f"{i}/{len(files)} pages scanned")

# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
with (OUTPUT_DIR / "resolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with (OUTPUT_DIR / "unresolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))