from pathlib import Path
import json
import re

from bs4 import BeautifulSoup

# ======================
# CONFIG
# ======================
INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../link_registry")
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]

# ======================
# HELPERS
# ======================

def normalize(title: str) -> str:
    """Canonical key normalization."""
    if not title:
        return ""
    title = title.strip()
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)
    return title.lower()


def slugify(title: str) -> str:
    return normalize(title).replace(" ", "_")


def strip_prefix(title: str) -> str:
    t = title.lower()
    for p in PREFIXES:
        if t.startswith(p):
            return title[len(p):]
    return title


def extract_title(soup: BeautifulSoup):
    """Try multiple strategies to extract the page title."""
    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)

    # Strategy 2: HTML <title>, usually "Page title - SiteName".
    # Split on " - " (not bare "-") so hyphenated titles survive intact.
    if soup.title:
        title = soup.title.get_text()
        if " - " in title:
            return title.rsplit(" - ", 1)[0].strip()
        return title.strip()

    return None


def detect_redirect(soup: BeautifulSoup):
    """Detect MediaWiki redirect pages."""
    text = soup.get_text(" ", strip=True).lower()

    # HTTrack redirects often contain this
    if "#redirect" in text:
        link = soup.find("a")
        if link and link.get("href"):
            return link["href"]

    # Alternative pattern
    redirect_note = soup.find(class_="redirectText")
    if redirect_note:
        link = redirect_note.find("a")
        if link:
            return link.get("href")

    return None


# ======================
# MAIN
# ======================
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} files found")

for i, file_path in enumerate(files, 1):
    try:
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(html, "html.parser")

        title = extract_title(soup)
        if not title:
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found",
            })
            continue

        key = slugify(title)

        # Register canonical page
        title_registry[key] = {
            "title": title,
            "file": str(file_path),
        }

        # Detect redirect
        redirect_href = detect_redirect(soup)
        if redirect_href:
            target = Path(redirect_href).stem
            target_key = slugify(target)
            redirects[key] = target_key
            alias_registry[key] = target_key

    except Exception as e:
        unresolved.append({
            "file": file_path.name,
            "reason": str(e),
        })

    if i % 100 == 0:
        print(f"{i}/{len(files)} processed")

# ======================
# AUTO ALIAS GENERATION
# ======================
auto_alias_count = 0
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1

print(f"Auto-generated aliases: {auto_alias_count}")

# ======================
# SAVE FILES
# ======================
with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
    json.dump(title_registry, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
    json.dump(alias_registry, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
    json.dump(redirects, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
    json.dump(unresolved, f, indent=2, ensure_ascii=False)

print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Unique pages: {len(title_registry)}")
print(f"Aliases: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problems: {len(unresolved)}")
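
# ----------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the build): how a
# downstream step might resolve a raw wiki title through the registries
# this script produces. `resolve` and its `max_hops` cycle guard are
# hypothetical helpers, not something the script above defines.
def resolve(raw_title: str, max_hops: int = 5):
    """Follow alias/redirect chains from a raw title to its canonical entry."""
    key = slugify(raw_title)
    for _ in range(max_hops):  # bound the walk in case redirects form a cycle
        if key in title_registry:
            return title_registry[key]
        if key in alias_registry:
            key = alias_registry[key]  # hop to the alias target and retry
        else:
            return None
    return None

# Example: resolve("Category: Some Page") would hop through the
# auto-generated "category:" alias to the entry keyed "some_page",
# assuming that page exists in title_registry.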