tweak paths and find links
This commit is contained in:
parent
36c8bb2354
commit
e4aaa33137
7 changed files with 309 additions and 10 deletions
177
build_link_registry.py
Normal file
177
build_link_registry.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
from pathlib import Path
|
||||
import json
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ======================
# CONFIG
# ======================

# Crawled HTML pages to scan (relative to this script's working dir).
INPUT_DIR = Path("../unique_pages")
# Where the generated JSON registries are written.
OUTPUT_DIR = Path("../link_registry")

# Create the output directory up front; reuse it silently if it exists.
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]
|
||||
|
||||
# ======================
|
||||
# HELPERS
|
||||
# ======================
|
||||
|
||||
def normalize(title: str) -> str:
    """Return the canonical lookup-key form of *title*.

    Underscores are treated as spaces (MediaWiki convention), runs of
    whitespace collapse to a single space, and the result is trimmed
    and lower-cased.  Falsy input (None, "") yields "".
    """
    if not title:
        return ""

    # Replace underscores BEFORE trimming: the previous order stripped
    # first, so underscore-padded titles ("_Foo_") kept stray spaces
    # (" foo ") and produced a different key than the plain form.
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)

    return title.strip().lower()
|
||||
|
||||
|
||||
def slugify(title: str) -> str:
    """Return the filename-style key for *title* (canonical form, spaces -> underscores)."""
    canonical = normalize(title)
    return canonical.replace(" ", "_")
|
||||
|
||||
|
||||
def strip_prefix(title: str) -> str:
    """Drop a known namespace prefix (see PREFIXES) from *title*, if present.

    Matching is case-insensitive; the remainder keeps its original
    casing.  Titles without a recognised prefix are returned unchanged.
    """
    lowered = title.lower()
    for prefix in PREFIXES:
        if lowered.startswith(prefix):
            # Slice the ORIGINAL string so casing after the prefix survives.
            return title[len(prefix):]
    return title
|
||||
|
||||
|
||||
def extract_title(soup: "BeautifulSoup"):
    """Best-effort extraction of a page title from parsed HTML.

    Strategy 1: the MediaWiki ``<h1 id="firstHeading">`` element.
    Strategy 2: the ``<title>`` tag, keeping only the part before a
    ``" - "`` separator (wikis commonly append the site name there).

    Returns the title string, or None when neither strategy applies.
    """

    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)

    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        # Split on " - " (with spaces) rather than a bare "-": splitting
        # on "-" truncated hyphenated titles ("Spider-Man" -> "Spider").
        base, sep, _ = title.partition(" - ")
        if sep:
            return base.strip()
        return title.strip()

    return None
|
||||
|
||||
|
||||
def detect_redirect(soup: BeautifulSoup):
    """Return the redirect target href of a MediaWiki redirect page, else None.

    Two heuristics, tried in order:
      1. the page text contains "#redirect" (common in HTTrack dumps) —
         the first anchor's href is taken as the target;
      2. an element with class "redirectText" wraps the target link.
    """
    page_text = soup.get_text(" ", strip=True).lower()

    # HTTrack redirects often contain this
    if "#redirect" in page_text:
        anchor = soup.find("a")
        if anchor and anchor.get("href"):
            return anchor["href"]

    # alternative pattern
    note = soup.find(class_="redirectText")
    if note:
        anchor = note.find("a")
        if anchor:
            return anchor.get("href")

    return None
|
||||
|
||||
|
||||
# ======================
# MAIN
# ======================

# Registries built while scanning: canonical pages, alias -> target keys,
# detected redirect pages, and files that could not be processed.
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} fichiers trouvés")

for i, file_path in enumerate(files, 1):

    try:
        markup = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(markup, "html.parser")

        page_title = extract_title(soup)
        if not page_title:
            # No title means no usable key — record and move on.
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found",
            })
            continue

        key = slugify(page_title)

        # register canonical page
        title_registry[key] = {
            "title": page_title,
            "file": str(file_path),
        }

        # detect redirect
        href = detect_redirect(soup)
        if href:
            # The href's stem is the target page's filename-style key.
            target_key = slugify(Path(href).stem)
            redirects[key] = target_key
            alias_registry[key] = target_key

    except Exception as e:
        # Best-effort batch job: keep going, remember what failed and why.
        unresolved.append({
            "file": file_path.name,
            "reason": str(e),
        })

    if i % 100 == 0:
        print(f"{i}/{len(files)} traités")

# ======================
# AUTO ALIAS GENERATION
# ======================

auto_alias_count = 0

# If both a prefixed page ("category_foo") and its bare form ("foo")
# exist, alias the prefixed key to the bare one.
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1

print(f"Alias automatiques ajoutés: {auto_alias_count}")

# ======================
# SAVE FILES
# ======================

for filename, payload in (
    ("title_registry.json", title_registry),
    ("alias_registry.json", alias_registry),
    ("redirects_detected.json", redirects),
    ("unresolved_pages.json", unresolved),
):
    with open(OUTPUT_DIR / filename, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Pages uniques: {len(title_registry)}")
print(f"Alias: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problèmes: {len(unresolved)}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue