"""Scan the deduplicated HTML pages, extract article links from the main
content area, and resolve each link against the title/alias registries.
Resolved and unresolved links are written to JSON files in REGISTRY_DIR."""

import json
from pathlib import Path

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

# Registries built in the previous step: canonical page titles and
# alias -> canonical-title redirects.
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================

def normalize_href(href: str):
    """Turn an href into a lowercase registry key, or None if it should be skipped."""
    if not href:
        return None
    # ignore external links
    if href.startswith("http"):
        return None
    name = Path(href).stem
    return name.lower()


def resolve(name):
    """Map a normalized link name to a canonical title, or None if unknown."""
    if name in title_registry:
        return name
    if name in alias_registry:
        return alias_registry[name]
    # fall back: try the name without its category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt
    return None


def extract_article_links(soup):
    """Collect hrefs from the article body, skipping anchors and non-article links."""
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.find_all("a", href=True):
        href = a["href"]
        # ignore in-page anchors
        if href.startswith("#"):
            continue
        # ignore files, images, special pages, and action URLs (history, edit, ...)
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue
        links.append(href)
    return links


# ======================
# MAIN
# ======================

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)

    for href in links:
        key = normalize_href(href)
        if not key:
            continue
        resolved = resolve(key)
        entry = {
            "source": file_path.name,
            "link": href,
        }
        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================

with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))