tweak paths and find links

This commit is contained in:
maximator 2026-04-03 15:50:40 +02:00
parent 36c8bb2354
commit e4aaa33137
7 changed files with 309 additions and 10 deletions

View file

@@ -1,8 +1,8 @@
 import json
 from collections import defaultdict
-INPUT_FILE = "categories.json"
-OUTPUT_FILE = "categories_analysis.json"
+INPUT_FILE = "../categories.json"
+OUTPUT_FILE = "../categories_analysis.json"
 # typical MediaWiki noise patterns
 IGNORE_PATTERNS = [

build_link_registry.py (new file, 177 lines added)
View file

@@ -0,0 +1,177 @@
from pathlib import Path
import json
import re
from bs4 import BeautifulSoup

# ======================
# CONFIG
# ======================
INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../link_registry")
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]

# ======================
# HELPERS
# ======================
def normalize(title: str) -> str:
    """Canonical key normalization."""
    if not title:
        return ""
    title = title.strip()
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)
    return title.lower()

def slugify(title: str) -> str:
    return normalize(title).replace(" ", "_")

def strip_prefix(title: str) -> str:
    t = title.lower()
    for p in PREFIXES:
        if t.startswith(p):
            return title[len(p):]
    return title

def extract_title(soup: BeautifulSoup):
    """Try multiple strategies to extract page title."""
    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)
    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        if "-" in title:
            return title.split("-")[0].strip()
        return title.strip()
    return None

def detect_redirect(soup: BeautifulSoup):
    """Detect MediaWiki redirect pages."""
    text = soup.get_text(" ", strip=True).lower()
    # HTTrack redirects often contain this
    if "#redirect" in text:
        link = soup.find("a")
        if link and link.get("href"):
            return link["href"]
    # alternative pattern
    redirect_note = soup.find(class_="redirectText")
    if redirect_note:
        link = redirect_note.find("a")
        if link:
            return link.get("href")
    return None

# ======================
# MAIN
# ======================
title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} files found")

for i, file_path in enumerate(files, 1):
    try:
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(html, "html.parser")
        title = extract_title(soup)
        if not title:
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found"
            })
            continue
        key = slugify(title)
        # register canonical page
        title_registry[key] = {
            "title": title,
            "file": str(file_path)
        }
        # detect redirect
        redirect_href = detect_redirect(soup)
        if redirect_href:
            target = Path(redirect_href).stem
            target_key = slugify(target)
            redirects[key] = target_key
            alias_registry[key] = target_key
    except Exception as e:
        unresolved.append({
            "file": file_path.name,
            "reason": str(e)
        })
    if i % 100 == 0:
        print(f"{i}/{len(files)} processed")

# ======================
# AUTO ALIAS GENERATION
# ======================
auto_alias_count = 0
for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))
    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1
print(f"Automatic aliases added: {auto_alias_count}")

# ======================
# SAVE FILES
# ======================
with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
    json.dump(title_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
    json.dump(alias_registry, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
    json.dump(redirects, f, indent=2, ensure_ascii=False)
with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
    json.dump(unresolved, f, indent=2, ensure_ascii=False)

print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Unique pages: {len(title_registry)}")
print(f"Aliases: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Issues: {len(unresolved)}")

View file

@@ -3,8 +3,8 @@ import re
 from pathlib import Path
 from collections import defaultdict, Counter
-INPUT_DIR = Path(".")  # folder containing the files
-OUTPUT_FILE = "category_analysis.json"
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_FILE = "../category_analysis.json"
 # ---------------------------

View file

@@ -7,8 +7,8 @@ from collections import defaultdict
 # CONFIG
 # =========================
-INPUT_DIR = "."  # folder containing the 700 files
-OUTPUT_FILE = "categories.json"
+INPUT_DIR = "../original_index"  # folder containing the 700 files
+OUTPUT_FILE = "../categories.json"
 # file extensions to analyze
 VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}

View file

@@ -4,8 +4,8 @@ import re
 import json
-INPUT_DIR = Path(".")
-OUTPUT_DIR = Path("unique_pages")
+INPUT_DIR = Path("../original_index")
+OUTPUT_DIR = Path("../unique_pages")
 OUTPUT_DIR.mkdir(exist_ok=True)
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
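
Only the config hunk of this splitter script is shown here; judging by ARTICLE_ID_RE, it presumably deduplicates mirror copies by the wgArticleId embedded in each page. A hedged sketch of how such a regex is typically applied, not the script's actual code; the seen_ids set and the copy step are illustrative:

import re
import shutil
from pathlib import Path

INPUT_DIR = Path("../original_index")
OUTPUT_DIR = Path("../unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')

seen_ids = set()
for page in INPUT_DIR.glob("*.html"):
    html = page.read_text(encoding="utf-8", errors="ignore")
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue  # page has no embedded wgArticleId
    article_id = m.group(1)
    if article_id in seen_ids:
        continue  # duplicate copy of an article already kept
    seen_ids.add(article_id)
    shutil.copy2(page, OUTPUT_DIR / page.name)  # keep the first copy seen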

scan_internal_links.py (new file, 122 lines added)
View file

@@ -0,0 +1,122 @@
from pathlib import Path
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================
def normalize_href(href: str):
    if not href:
        return None
    # ignore external links
    if href.startswith("http"):
        return None
    name = Path(href).stem
    return name.lower()

def resolve(name):
    if name in title_registry:
        return name
    if name in alias_registry:
        return alias_registry[name]
    # try removing category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt
    return None

def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.find_all("a", href=True):
        href = a["href"]
        # ignore anchors
        if href.startswith("#"):
            continue
        # ignore files/images/history/etc
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue
        links.append(href)
    return links

# ======================
# MAIN
# ======================
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for href in links:
        key = normalize_href(href)
        if not key:
            continue
        resolved = resolve(key)
        entry = {
            "source": file_path.name,
            "link": href,
        }
        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)
    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================
json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8"), indent=2)
json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8"), indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))

View file

@@ -3,8 +3,8 @@ import shutil
 import re
 import json
-INPUT_DIR = Path("unique_pages")
-OUTPUT_DIR = Path("classified_pages")
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../classified_pages")
 CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')