"""Scan downloaded MediaWiki pages for internal links, resolve each link
target against the equivalence registry, and write resolved / unresolved
link lists as JSON."""

from pathlib import Path
import json
import re
from urllib.parse import urlparse, parse_qs, unquote

from bs4 import BeautifulSoup

# --------------------------------------------------
# PATHS
# --------------------------------------------------
PAGES_DIR = Path("../output/pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
with REGISTRY_PATH.open(encoding="utf-8") as f:
    registry = json.load(f)

equivalences = registry["equivalences"]
canonical_pages = registry["canonical_pages"]
valid_targets = set(canonical_pages.values())

# --------------------------------------------------
# HELPERS
# --------------------------------------------------
def normalize_title(title: str | None):
    """Decode, de-underscore, collapse whitespace and casefold a page title."""
    if not title:
        return None
    title = unquote(title)
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title.strip())
    return title.casefold()


# -------------------------
# Extract MediaWiki target
# -------------------------
def extract_mediawiki_target(href: str):
    if not href:
        return None

    # ignore anchors
    if href.startswith("#"):
        return None

    parsed = urlparse(href)

    # external link
    if parsed.scheme in ("http", "https"):
        return None

    path = parsed.path or ""

    # /wiki/Page_Name
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]

    # index.php?title=Page
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]

    # fallback: filename-like path
    return Path(path).stem


# -------------------------
# Ignore unwanted namespaces
# -------------------------
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)


def is_ignored_namespace(title_norm: str):
    return title_norm.startswith(IGNORED_PREFIXES)


# -------------------------
# Extract article content
# -------------------------
def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []
    for a in content.select("a[href]"):
        # ignore navboxes / metadata
        if a.find_parent(class_="navbox"):
            continue
        links.append(a.get("href"))
    return links


# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------
resolved_links = []
unresolved_links = []

files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)

    for href in links:
        raw_target = extract_mediawiki_target(href)
        norm = normalize_title(raw_target)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            continue

        entry = {
            "source": file_path.name,
            "href": href,
            "normalized": norm,
        }

        resolved = equivalences.get(norm)
        if resolved:
            entry["resolved_title"] = resolved
            resolved_links.append(entry)
        else:
            unresolved_links.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# --------------------------------------------------
# SAVE RESULTS
# --------------------------------------------------
with (OUTPUT_DIR / "resolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(resolved_links, f, indent=2, ensure_ascii=False)

with (OUTPUT_DIR / "unresolved_links.json").open("w", encoding="utf-8") as f:
    json.dump(unresolved_links, f, indent=2, ensure_ascii=False)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(resolved_links))
print("Unresolved:", len(unresolved_links))
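
# --------------------------------------------------
# OUTPUT SHAPE (illustrative only)
# --------------------------------------------------
# Each record in resolved_links.json / unresolved_links.json is the `entry`
# dict built in the scan loop above. The values below are hypothetical and
# shown only to document the expected shape; a resolved entry additionally
# carries "resolved_title", unresolved entries omit it:
#
# {
#     "source": "Some_Page.html",
#     "href": "/wiki/Other_Page",
#     "normalized": "other page",
#     "resolved_title": "Other Page"
# }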