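"""Scan the exported wiki pages for internal links and resolve them.

Reads every HTML file in ../unique_pages, pulls the <a> links out of the
MediaWiki content area (div#mw-content-text), and looks each one up in the
title and alias registries stored in ../link_registry. Links that map to a
known page go to resolved_links.json, the rest to unresolved_links.json.
The relative paths assume the script is run from a directory that sits next
to unique_pages/ and link_registry/.
"""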
from pathlib import Path
import json
from urllib.parse import urlparse

from bs4 import BeautifulSoup

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

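# Both registries are plain JSON files. As inferred from how resolve() uses
# them below: title_registry only needs to answer membership tests (a dict
# keyed by canonical page name, or a list of names), and alias_registry maps
# an alias to its canonical page name.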
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================

def normalize_href(href: str):
    """Reduce an internal href to a lowercase lookup key (its file stem)."""
    if not href:
        return None

    # ignore external links
    if href.startswith("http"):
        return None

    name = Path(href).stem
    return name.lower()

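# Example (hypothetical href, not taken from the data):
#   normalize_href("articles/Some_Page.html") -> "some_page"
#   normalize_href("https://example.org/x")   -> None  (external link)
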
def resolve(name):
    """Map a normalized link key to a canonical page name, or None."""
    if name in title_registry:
        return name

    if name in alias_registry:
        return alias_registry[name]

    # try removing category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt

    return None

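# Example (hypothetical registry contents): with alias_registry
# {"ww2": "world_war_ii"}, resolve("ww2") -> "world_war_ii", and
# resolve("category_battles") falls back to "battles" if that name is in
# title_registry. A key matching nothing comes back as None.
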
def extract_article_links(soup):
    """Collect hrefs from the MediaWiki content area of a parsed page."""
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.find_all("a", href=True):
        href = a["href"]

        # ignore anchors
        if href.startswith("#"):
            continue

        # ignore files/images/history/etc
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue

        links.append(href)

    return links

# ======================
# MAIN
# ======================

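# Each extracted link becomes an entry of the form
# {"source": <page filename>, "link": <raw href>, "target": <canonical name>};
# entries whose key cannot be resolved are stored without "target".
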
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:
        key = normalize_href(href)
        if not key:
            continue

        resolved = resolve(key)

        entry = {
            "source": file_path.name,
            "link": href,
        }

        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================

with open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_RESOLVED, f, indent=2)
with open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8") as f:
    json.dump(OUTPUT_UNRESOLVED, f, indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))