Compare commits

...

3 commits

Author SHA1 Message Date
Maxime Réaux
186492de85 WIP 2026-04-16 10:04:58 +02:00
Maxime Réaux
61d7f6b646 avoid overwrite homonym canonicals 2026-04-15 12:10:32 +02:00
Maxime Réaux
8e9289998b keep error pages with fallback content 2026-04-15 10:36:21 +02:00
4 changed files with 216 additions and 94 deletions

89
boostack_create_pages.py Normal file
View file

@ -0,0 +1,89 @@
import os
import sys

import requests
# ==========================
# CONFIGURATION
# ==========================
# Base URL of the BookStack REST API (endpoints are appended as "/pages", ...).
BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api"
# API credentials. They are read from the BOOKSTACK_TOKEN_ID and
# BOOKSTACK_TOKEN_SECRET environment variables so that real secrets never
# have to be written into (and committed with) this file; the placeholders
# below are only used when the variables are not set.
API_TOKEN_ID = os.environ.get("BOOKSTACK_TOKEN_ID", "VOTRE_TOKEN_ID")
API_TOKEN_SECRET = os.environ.get("BOOKSTACK_TOKEN_SECRET", "VOTRE_TOKEN_SECRET")
# Input file listing the pages to create, one "title|chapter_id" per line.
PAGES_FILE = "pages.txt"
# HTML body used for pages created without explicit content.
DEFAULT_CONTENT = """
<p><em>Page restaurée automatiquement depuis l'ancien wiki.</em></p>
"""
# ==========================
# HEADERS
# ==========================
# Headers sent with every API request.
HEADERS = {
    "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}",
    "Content-Type": "application/json",
}
# ==========================
# FUNCTIONS
# ==========================
def create_page(title, chapter_id, content=DEFAULT_CONTENT, timeout=30):
    """Create a page in a BookStack chapter.

    Args:
        title: Name of the page to create.
        chapter_id: ID of the target chapter (an int or a numeric string).
        content: HTML body of the page.
        timeout: Seconds to wait for the API before giving up on the request.

    Returns:
        The ID of the created page, or None when the page could not be
        created (invalid chapter ID, network failure or error response).
    """
    try:
        payload = {
            "name": title,
            "html": content,
            "chapter_id": int(chapter_id),
        }
    except (TypeError, ValueError):
        # One malformed entry must not abort the whole import.
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(f"chapter_id invalide : {chapter_id!r}")
        return None

    url = f"{BOOKSTACK_API_URL}/pages"
    try:
        # Without a timeout, a stalled connection would block the import forever.
        response = requests.post(url, headers=HEADERS, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(exc)
        return None

    if response.status_code != 200:
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(response.status_code, response.text)
        return None

    page_id = response.json().get("id")
    print(f"[OK] Page créée : '{title}' (ID {page_id})")
    return page_id
def load_pages(filename):
    """Load the list of pages to create from a text file.

    Each useful line has the form ``title|chapter_id``. Blank lines and lines
    starting with ``#`` are skipped. The line is split on its LAST ``|`` so
    that a title may itself contain that character. Lines without a
    separator, with an empty title, or with a chapter ID that is not an
    integer are reported and ignored.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(title, chapter_id)`` tuples of stripped strings, in the
        order they appear in the file.
    """
    pages = []
    with open(filename, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            title, separator, chapter_id = line.rpartition("|")
            title = title.strip()
            chapter_id = chapter_id.strip()

            valid = bool(separator and title)
            if valid:
                # Reject here what int() would reject later in create_page,
                # so a typo is reported instead of crashing the import.
                try:
                    int(chapter_id)
                except ValueError:
                    valid = False

            if not valid:
                print(f"[IGNORÉ] Ligne invalide : {line}")
                continue
            pages.append((title, chapter_id))
    return pages
# ==========================
# MAIN
# ==========================
def main():
    """Create every page listed in PAGES_FILE, printing progress on the way."""
    entries = load_pages(PAGES_FILE)
    if not entries:
        # Nothing to import: leave with a success status.
        print("Aucune page à créer.")
        sys.exit(0)

    count = len(entries)
    print(f"{count} pages à créer...\n")
    for page_title, page_chapter in entries:
        create_page(page_title, page_chapter)
    print("\nImport terminé.")


if __name__ == "__main__":
    main()

View file

@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
"border", "border",
] ]
# Lowercase text fragments that mark a downloaded page as an error page.
# is_error_page() searches for them as substrings of the lowercased page text.
ERROR_PAGE_PATTERNS = [
# 5xx (server errors)
"503 service unavailable",
"502 bad gateway",
"500 internal server error",
"504 gateway time",
# 4xx (client errors)
"400 bad request",
"401 unauthorized",
"403 forbidden",
"404 not found",
"408 request time",
"419 page expired",
"429 too many requests",
# generic messages
"temporarily busy",
"server error",
"internal error",
"page not found",
"request could not be satisfied",
]
# ====================== # ======================
# HELPERS # HELPERS
# ====================== # ======================
@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
if getattr(el, "name", None) == "p": if getattr(el, "name", None) == "p":
break break
def is_error_page(soup: BeautifulSoup) -> bool:
    """Tell whether the page text contains one of ERROR_PAGE_PATTERNS.

    The whole text of ``soup`` is flattened (fragments joined by a space),
    lowercased, then searched for each pattern as a plain substring.
    """
    haystack = soup.get_text(" ", strip=True).lower()
    for pattern in ERROR_PAGE_PATTERNS:
        if pattern in haystack:
            return True
    return False
def build_fallback_html(title: str, filename: str) -> str:
    """Build a minimal placeholder document for a page whose content was lost.

    Args:
        title: Page title; when empty, a title is derived from ``filename``
            (underscores become spaces, ``.html`` is removed).
        filename: Name of the source file, mentioned in the placeholder body.

    Returns:
        A complete HTML document as a string.
    """
    # Local import keeps this helper independent of the module's import list.
    from html import escape

    raw_title = title or filename.replace("_", " ").replace(".html", "")
    # The title is plain text (already entity-decoded by the parser), so "&",
    # "<" and ">" must be escaped again before being written back as HTML.
    safe_title = escape(raw_title, quote=False)
    safe_filename = escape(filename, quote=False)
    return f"""<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
</head>
<body>
<h1>{safe_title}</h1>
<p>Lost content (HTTracker) in {safe_filename}</p>
</body>
</html>
"""
# ====================== # ======================
# CORE FUNCTIONS # CORE FUNCTIONS
# ====================== # ======================
def clean_html_file(input_path: Path, output_path: Path): def clean_html_file(input_path: Path, output_path: Path):
html = input_path.read_text(encoding="utf-8", errors="ignore") html_page = input_path.read_text(encoding="utf-8", errors="ignore")
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html_page, "html.parser")
# Remove comments (HTTrack etc.) # Remove comments (HTTrack etc.)
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
for el in soup.find_all(tag): for el in soup.find_all(tag):
el.decompose() el.decompose()
# Extract main content # Extract main content
content = soup.select_one("#mw-content-text") content = soup.select_one("#mw-content-text")
if not content: if not content or is_error_page(soup):
print(f"[WARN] No content in {input_path.name}") print(f"[WARN] No content in {input_path.name}")
fallback = build_fallback_html(
title=soup.title.get_text(strip=True) if soup.title else "",
filename=input_path.name
)
output_path.write_text(fallback, encoding="utf-8")
return return
remove_intro_rule_box(content) remove_intro_rule_box(content)
@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
def process_all():
    """Clean every ``*.html`` file found in SOURCE_DIR into OUTPUT_DIR.

    The file list is materialized first so the total is known, then progress
    is printed every 200 files and once more after the last one.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    sources = list(SOURCE_DIR.glob("*.html"))
    total = len(sources)
    print(f"{total} fichiers trouvés")
    for done, source in enumerate(sources, start=1):
        clean_html_file(source, OUTPUT_DIR / source.name)
        at_checkpoint = done % 200 == 0
        if at_checkpoint or done == total:
            print(f"{done}/{total} analysés ({done/total:.1%})")
    print("✅ Cleaning complete")

View file

@ -9,7 +9,7 @@ from difflib import SequenceMatcher
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import unicodedata import unicodedata
SOURCE_DIR = Path("../test") SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output") OUTPUT_DIR = Path("../output")
PAGES_DIR = Path(OUTPUT_DIR / "pages") PAGES_DIR = Path(OUTPUT_DIR / "pages")
@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
title.replace(" ", "_").replace("", "'").replace("", "'").replace("", '"').replace("", '"').casefold() + ".html" title.replace(" ", "_").replace("", "'").replace("", "'").replace("", '"').replace("", '"').casefold() + ".html"
) )
output_canonical_pages = {}
name_registry = {}
copied = 0 copied = 0
collision = 0
total = len(canonical_pages) total = len(canonical_pages)
for i, (article_id, data) in enumerate(canonical_pages.items(), 1): for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
src = data["path"] src = data["path"]
dst_name = title_to_filename(data["title"]) base_name = title_to_filename(data["title"])
dst = PAGES_DIR / dst_name if base_name in name_registry:
base_name = Path(base_name).stem
base_name = f"{base_name}__{article_id}.html"
collision += 1
problems.append(f"Resolved collision: {base_name} (from {src})")
name_registry[base_name] = article_id
dst = PAGES_DIR / base_name
try: try:
shutil.copy2(src, dst) shutil.copy2(src, dst)
canonical_pages[article_id] = dst_name output_canonical_pages[article_id] = base_name
copied += 1 copied += 1
except Exception as e: except Exception as e:
problems.append(f"Copy failed {src}: {e}") problems.append(f"Copy failed {src}: {e}")
if i % 200 == 0 or i == total: if i % 200 == 0 or i == total:
print(f"{i}/{total} copiés") print(f"{i}/{total} copiés")
print(f"{copied} pages copiées") print(f"{copied} pages copiées")
print(f"{collision} collisions détectées")
# -------------------------------------------------- # --------------------------------------------------
# SAVE REGISTRY # SAVE REGISTRY
# -------------------------------------------------- # --------------------------------------------------
registry = { registry = {
"canonical_pages": canonical_pages, "canonical_pages": output_canonical_pages,
"equivalences": equivalences, "equivalences": equivalences,
"potential_tags": potential_tags, "potential_tags": potential_tags,
"ignored_pages": ignored_pages, "ignored_pages": ignored_pages,
@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
with open(REPORT_PATH, "w", encoding="utf-8") as f: with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write("=== MIGRATION REPORT ===\n") f.write("=== MIGRATION REPORT ===\n")
f.write(f"Canonical pages: {len(canonical_pages)}\n") f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
f.write(f"Equivalences: {len(equivalences)}\n") f.write(f"Equivalences: {len(equivalences)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n") f.write(f"Problems: {len(problems)}\n\n")

View file

@ -3,16 +3,27 @@ import json
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, unquote from urllib.parse import urlparse, parse_qs, unquote
import unicodedata
# -------------------------------------------------- # --------------------------------------------------
# PATHS # CONFIG
# -------------------------------------------------- # --------------------------------------------------
PAGES_DIR = Path("../output/pages") PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json") REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan") OUTPUT_DIR = Path("../output_ok/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True)
# Prefixes of normalized titles that is_ignored_namespace() treats as
# non-article targets (tested with str.startswith on the casefolded title).
# NOTE(review): each prefix ends with a space, not ":" — presumably because
# targets arrive as "File_Foo" and "_" is normalized to " "; confirm that
# titles still written as "File:Foo" are meant to fall through.
IGNORED_PREFIXES = (
"file ",
"image ",
"category ",
"template ",
"special ",
"help ",
"user ",
"talk ",
)
# -------------------------------------------------- # --------------------------------------------------
# LOAD REGISTRY # LOAD REGISTRY
@ -29,92 +40,72 @@ valid_targets = set(canonical_pages.values())
# HELPERS # HELPERS
# -------------------------------------------------- # --------------------------------------------------
def normalize_title(title: str | None): def normalize_title(title: str) -> str:
if not title: if not title:
return None return
title = title.strip()
title = unquote(title) title = unquote(title)
title = Path(title).stem
title = unicodedata.normalize("NFKC", title)
title = title.replace("_", " ") title = title.replace("_", " ")
title = re.sub(r"\s+", " ", title.strip()) title = title.replace("", "'").replace("", "'").replace("", '"').replace("", '"')
title = re.sub(r"\s+", " ", title)
return title.casefold() return title.casefold()
# -------------------------
# Extract MediaWiki target
# -------------------------
# URL schemes that can never designate a page of the mirrored wiki.
_NON_PAGE_SCHEMES = frozenset(
    {"http", "https", "ftp", "mailto", "javascript", "tel", "data"}
)


def extract_mediawiki_target(href: str):
    """Extract the raw page target designated by a link ``href``.

    Args:
        href: Value of an ``<a href>`` attribute; may be empty or None.

    Returns:
        The part after ``/wiki/``, the ``title`` query parameter of an
        ``index.php`` URL, or otherwise the stem of the path's last
        component. None for empty hrefs, pure anchors and external links.
    """
    if not href or href.startswith("#"):
        return None

    parsed = urlparse(href)
    # A network location also covers protocol-relative links
    # ("//host/wiki/Page"), which have an empty scheme but are external.
    if parsed.netloc or parsed.scheme in _NON_PAGE_SCHEMES:
        return None

    path = parsed.path or ""
    # /wiki/Page_Name
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    # index.php?title=Page
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    # Fallback: treat the path as a file name.
    return Path(path).stem
# -------------------------
# Ignore unwanted namespaces
# -------------------------
IGNORED_PREFIXES = (
"file:",
"image:",
"template:",
"special:",
"help:",
"user:",
"talk:",
)
def is_ignored_namespace(title_norm: str):
    """Tell whether a normalized title starts with one of IGNORED_PREFIXES."""
    for prefix in IGNORED_PREFIXES:
        if title_norm.startswith(prefix):
            return True
    return False
# -------------------------
# Extract article content
# -------------------------
def extract_article_links(soup):
    """Collect the links found in the article body of a parsed page.

    Only ``<a href>`` elements inside ``div#mw-content-text`` are kept, and
    those nested in an element of class ``navbox`` are skipped.

    Returns:
        A list of dicts with the keys ``href``, ``title`` and ``text``; an
        empty list when the page has no article container.
    """
    article = soup.find("div", id="mw-content-text")
    if not article:
        return []
    return [
        {
            "href": anchor.get("href"),
            "title": anchor.get("title"),
            "text": anchor.get_text(strip=True),
        }
        for anchor in article.select("a[href]")
        if not anchor.find_parent(class_="navbox")
    ]
def resolve_link(raw_target, title_attr):
    """Resolve a link to a canonical page file name.

    The ``title`` attribute is tried first, then the raw target. For each
    non-empty candidate, in order: an ignored namespace stops the search, a
    hit in ``equivalences`` wins, then a direct ``<name>.html`` match against
    ``valid_targets``.

    Returns:
        A ``(resolved, method)`` tuple where ``method`` is ``"ignored"``,
        ``"equivalence"``, ``"direct"`` or ``"unresolved"``; ``resolved`` is
        None for the ignored and unresolved cases.
    """
    for candidate in (c for c in (title_attr, raw_target) if c):
        key = normalize_title(candidate)
        if not key:
            continue
        if is_ignored_namespace(key):
            return None, "ignored"
        if key in equivalences:
            return equivalences[key], "equivalence"
        page_file = key.replace(" ", "_") + ".html"
        if page_file in valid_targets:
            return page_file, "direct"
    return None, "unresolved"
# -------------------------------------------------- # --------------------------------------------------
# MAIN SCAN # MAIN SCAN
@ -122,43 +113,29 @@ def extract_article_links(soup):
resolved_links = [] resolved_links = []
unresolved_links = [] unresolved_links = []
files = list(PAGES_DIR.glob("*.html")) files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser") print(f"{len(files)} pages à analyser")
for i, file_path in enumerate(files, 1): for i, file_path in enumerate(files, 1):
html = file_path.read_text(encoding="utf-8", errors="ignore") html = file_path.read_text(encoding="utf-8", errors="ignore")
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
links = extract_article_links(soup) links = extract_article_links(soup)
for link in links:
for href in links: raw_target = extract_mediawiki_target(link["href"])
resolved, method = resolve_link(raw_target, link["title"])
raw_target = extract_mediawiki_target(href)
norm = normalize_title(raw_target)
if not norm:
continue
if is_ignored_namespace(norm):
continue
entry = { entry = {
"source": file_path.name, "source": file_path.name,
"href": href, "href": link["href"],
"normalized": norm, "title": link["title"],
"method": method,
} }
resolved = equivalences.get(norm)
if resolved: if resolved:
entry["resolved_title"] = resolved entry["resolved"] = resolved
resolved_links.append(entry) resolved_links.append(entry)
else: else:
entry["raw_target"] = raw_target
unresolved_links.append(entry) unresolved_links.append(entry)
if i % 200 == 0:
if i % 100 == 0:
print(f"{i}/{len(files)} analysées") print(f"{i}/{len(files)} analysées")
# -------------------------------------------------- # --------------------------------------------------