WIP

avoid overwrite homonym canonicals
keep error pages with fallback content
2026-04-16 10:04:58 +02:00 · 2026-04-15 12:10:32 +02:00 · 2026-04-15 10:36:21 +02:00
4 changed files with 216 additions and 94 deletions
--- a/boostack_create_pages.py
+++ b/boostack_create_pages.py
@ -0,0 +1,89 @@
 import requests
 import sys
 # ==========================
 # CONFIGURATION
 # ==========================
 # Base URL of the BookStack REST API; create_page() appends "/pages" to it.
 BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api"
 # API token pair interpolated into the Authorization header below.
 # The values here look like placeholders to replace before running.
 # NOTE(review): avoid committing real credentials in this file.
 API_TOKEN_ID = "VOTRE_TOKEN_ID"
 API_TOKEN_SECRET = "VOTRE_TOKEN_SECRET"
 # Input file read by load_pages(): one "title|chapter_id" entry per line.
 PAGES_FILE = "pages.txt"
 # HTML body used when create_page() is called without explicit content.
 DEFAULT_CONTENT = """
 <p><em>Page restaurée automatiquement depuis l'ancien wiki.</em></p>
 """
 # ==========================
 # HEADERS
 # ==========================
 # Headers passed to requests.post() in create_page(): token authentication
 # in the "Token <id>:<secret>" form, plus a JSON content type.
 HEADERS = {
    "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}",
    "Content-Type": "application/json"
 }
 # ==========================
 # FUNCTIONS
 # ==========================
 def create_page(title, chapter_id, content=DEFAULT_CONTENT):
    """Create a page in a BookStack chapter.

    Args:
        title: Name of the page to create.
        chapter_id: ID of the target chapter; any value ``int()`` accepts.
        content: HTML body of the page.

    Returns:
        The ID of the created page, or ``None`` when the chapter ID is not
        an integer, the HTTP request fails, or the API answers with a
        status code other than 200.
    """
    # Validate the chapter ID first so one bad entry is reported instead of
    # aborting the whole import with an uncaught ValueError.
    try:
        payload = {
            "name": title,
            "html": content,
            "chapter_id": int(chapter_id),
        }
    except (TypeError, ValueError):
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(f"chapter_id invalide : {chapter_id!r}")
        return None

    url = f"{BOOKSTACK_API_URL}/pages"
    try:
        # requests never times out by default; without this a single stalled
        # connection would block the import forever.
        response = requests.post(url, headers=HEADERS, json=payload, timeout=30)
    except requests.RequestException as exc:
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(exc)
        return None

    if response.status_code != 200:
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(response.status_code, response.text)
        return None

    page_id = response.json().get("id")
    print(f"[OK] Page créée : '{title}' (ID {page_id})")
    return page_id
 def load_pages(filename):
    """Load the list of pages to create from a text file.

    Each useful line has the form ``title|chapter_id``. Blank lines and
    lines starting with ``#`` are skipped. A line is reported with
    ``[IGNORÉ]`` and dropped when it does not contain exactly one ``|``,
    when its title is empty, or when its chapter ID is not an integer.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(title, chapter_id)`` tuples of stripped strings, in
        file order.
    """
    pages = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = [field.strip() for field in line.split("|")]
            valid = len(fields) == 2 and bool(fields[0])
            if valid:
                # Check the ID here so that a malformed entry is rejected
                # now rather than failing later when it is converted.
                try:
                    int(fields[1])
                except ValueError:
                    valid = False
            if not valid:
                print(f"[IGNORÉ] Ligne invalide : {line}")
                continue
            pages.append((fields[0], fields[1]))
    return pages
 # ==========================
 # MAIN
 # ==========================
 def main():
    """Read the page list from PAGES_FILE and create every entry.

    Exits with status 0 when the list is empty.
    """
    entries = load_pages(PAGES_FILE)
    if not entries:
        print("Aucune page à créer.")
        sys.exit(0)

    count = len(entries)
    print(f"{count} pages à créer...\n")
    for page_title, target_chapter in entries:
        create_page(page_title, target_chapter)
    print("\nImport terminé.")
 if __name__ == "__main__":
    main()
--- a/extract_content.py
+++ b/extract_content.py
@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
    "border",
 ]
 # Lowercase substrings that identify an HTTP/server error page.
 # is_error_page() searches for them in the lowercased text of a document.
 ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
 ]
 # ======================
 # HELPERS
 # ======================
@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
        if getattr(el, "name", None) == "p":
            break
 def is_error_page(soup: BeautifulSoup) -> bool:
    """Tell whether the document text contains a known error-page marker.

    The visible text is flattened with single-space separators and
    lowercased, then searched for each entry of ERROR_PAGE_PATTERNS.
    """
    page_text = soup.get_text(" ", strip=True).lower()
    for marker in ERROR_PAGE_PATTERNS:
        if marker in page_text:
            return True
    return False
 def build_fallback_html(title: str, filename: str) -> str:
    """Build a minimal placeholder HTML document for a page without content.

    Args:
        title: Page title; when empty, a title is derived from ``filename``
            by turning underscores into spaces and dropping ``.html``.
        filename: Name of the source file, mentioned in the placeholder body.

    Returns:
        The HTML document as a string, with the title and file name escaped.
    """
    # Function-scope import so the helper stays self-contained.
    from html import escape

    display_title = title or filename.replace("_", " ").replace(".html", "")
    # Escape &, < and > so a title or file name containing markup characters
    # cannot break the generated document.
    safe_title = escape(display_title, quote=False)
    safe_filename = escape(filename, quote=False)
    return f"""<html>
 <head>
  <meta charset="utf-8">
  <title>{safe_title}</title>
 </head>
 <body>
  <h1>{safe_title}</h1>
  <p>Lost content (HTTracker) in {safe_filename}</p>
 </body>
 </html>
 """
 # ======================
 # CORE FUNCTIONS
 # ======================
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    soup = BeautifulSoup(html_page, "html.parser")
    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
        for el in soup.find_all(tag):
            el.decompose()
    # Extract main content
    content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
        print(f"[WARN] No content in {input_path.name}")
        fallback = build_fallback_html(
            title=soup.title.get_text(strip=True) if soup.title else "",
            filename=input_path.name
        )
        output_path.write_text(fallback, encoding="utf-8")
        return
    remove_intro_rule_box(content)
@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 def process_all():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
+    files = list(SOURCE_DIR.glob("*.html"))
-    for file in SOURCE_DIR.glob("*.html"):
+    total = len(files)
    print(f"{total} fichiers trouvés")
    for i, file in enumerate(files, start=1):
        output_file = OUTPUT_DIR / file.name
        clean_html_file(file, output_file)
-
+        if i % 200 == 0 or i == total:
            print(f"{i}/{total} analysés ({i/total:.1%})")
    print("✅ Cleaning complete")
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@ -9,7 +9,7 @@ from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
 import unicodedata
-SOURCE_DIR = Path("../test")
+SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
 PAGES_DIR = Path(OUTPUT_DIR / "pages")
@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
        title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
    )
-
+output_canonical_pages = {}
 name_registry = {}
 copied = 0
 collision = 0
 total = len(canonical_pages)
 for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
    src = data["path"]
-    dst_name = title_to_filename(data["title"])
+    base_name = title_to_filename(data["title"])
-    dst = PAGES_DIR / dst_name
+    if base_name in name_registry:
-
+        base_name = Path(base_name).stem
        base_name = f"{base_name}__{article_id}.html"
        collision += 1
        problems.append(f"Resolved collision: {base_name} (from {src})")
    name_registry[base_name] = article_id
    dst = PAGES_DIR / base_name
    try:
        shutil.copy2(src, dst)
-        canonical_pages[article_id] = dst_name
+        output_canonical_pages[article_id] = base_name
        copied += 1
    except Exception as e:
        problems.append(f"Copy failed {src}: {e}")
    if i % 200 == 0 or i == total:
        print(f"{i}/{total} copiés")
 print(f"{copied} pages copiées")
 print(f"{collision} collisions détectées")
 # --------------------------------------------------
 # SAVE REGISTRY
 # --------------------------------------------------
 registry = {
-    "canonical_pages": canonical_pages,
+    "canonical_pages": output_canonical_pages,
    "equivalences": equivalences,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
 with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write("=== MIGRATION REPORT ===\n")
-    f.write(f"Canonical pages: {len(canonical_pages)}\n")
+    f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@ -3,16 +3,27 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
 import unicodedata
 # --------------------------------------------------
-# PATHS
+# CONFIG
 # --------------------------------------------------
-PAGES_DIR = Path("../output/pages")
+PAGES_DIR = Path("../output_ok/cleaned_pages")
-REGISTRY_PATH = Path("../output/equivalence_registry.json")
+REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
-OUTPUT_DIR = Path("../output/link_scan")
+OUTPUT_DIR = Path("../output_ok/link_scan")
 OUTPUT_DIR.mkdir(exist_ok=True)
 IGNORED_PREFIXES = (
    "file ",
    "image ",
    "category ",
    "template ",
    "special ",
    "help ",
    "user ",
    "talk ",
 )
 # --------------------------------------------------
 # LOAD REGISTRY
@ -29,92 +40,72 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
-def normalize_title(title: str | None):
+def normalize_title(title: str) -> str:
    if not title:
-        return None
+        return
-
+    title = title.strip()
    title = unquote(title)
    title = Path(title).stem
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()
 # -------------------------
 # Extract MediaWiki target
 # -------------------------
 def extract_mediawiki_target(href: str):
    """Extract the raw page target from a link href.

    Returns ``None`` for empty hrefs, in-page anchors and http(s) links.
    Otherwise returns, in order of preference: the part of the path after
    ``/wiki/``, the first ``title`` query value of an ``index.php`` link,
    or the stem of the path's last component.
    """
    # Empty hrefs and pure anchors carry no page target.
    if not href or href.startswith("#"):
        return None

    parts = urlparse(href)
    # Absolute http(s) URLs are external links.
    if parts.scheme == "http" or parts.scheme == "https":
        return None

    url_path = parts.path if parts.path else ""

    # /wiki/Page_Name
    _, wiki_marker, page_name = url_path.partition("/wiki/")
    if wiki_marker:
        return page_name

    # index.php?title=Page
    if "index.php" in url_path:
        title_values = parse_qs(parts.query).get("title")
        if title_values:
            return title_values[0]

    # Fallback: treat the path as a file name and drop its extension.
    return Path(url_path).stem
 # -------------------------
 # Ignore unwanted namespaces
 # -------------------------
 IGNORED_PREFIXES = (
    "file:",
    "image:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
 )
 def is_ignored_namespace(title_norm: str) -> bool:
    """Return True when the title starts with one of IGNORED_PREFIXES.

    The comparison is a plain prefix test, so ``title_norm`` must already be
    in the same case as the prefixes.
    """
    return title_norm.startswith(IGNORED_PREFIXES)
 # -------------------------
 # Extract article content
 # -------------------------
 def extract_article_links(soup):
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.select("a[href]"):
        # ignore navboxes / metadata
        if a.find_parent(class_="navbox"):
            continue
-
+        links.append({
-        href = a.get("href")
+            "href": a.get("href"),
-        links.append(href)
+            "title": a.get("title"),
-
+            "text": a.get_text(strip=True),
        })
    return links
 def resolve_link(raw_target, title_attr):
    """Resolve a link to a known page.

    The link's title attribute is tried first, then the target extracted
    from its href; empty values are skipped.

    Returns:
        A ``(target, method)`` tuple where ``method`` is one of:
        ``"ignored"`` (candidate is in an ignored namespace, target is None),
        ``"equivalence"`` (target looked up in ``equivalences``),
        ``"direct"`` (derived file name found in ``valid_targets``),
        ``"unresolved"`` (no candidate matched, target is None).
    """
    for candidate in filter(None, (title_attr, raw_target)):
        key = normalize_title(candidate)
        if not key:
            continue

        if is_ignored_namespace(key):
            return None, "ignored"

        if key in equivalences:
            return equivalences[key], "equivalence"

        # Same naming scheme as the page files: spaces become underscores.
        candidate_file = key.replace(" ", "_") + ".html"
        if candidate_file in valid_targets:
            return candidate_file, "direct"

    return None, "unresolved"
 # --------------------------------------------------
 # MAIN SCAN
@ -122,43 +113,29 @@ def extract_article_links(soup):
 resolved_links = []
 unresolved_links = []
 files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages à analyser")
 for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
-
+    for link in links:
-    for href in links:
+        raw_target = extract_mediawiki_target(link["href"])
-
+        resolved, method = resolve_link(raw_target, link["title"])
        raw_target = extract_mediawiki_target(href)
        norm = normalize_title(raw_target)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            continue
        entry = {
            "source": file_path.name,
-            "href": href,
+            "href": link["href"],
-            "normalized": norm,
+            "title": link["title"],
            "method": method,
        }
        resolved = equivalences.get(norm)
        if resolved:
-            entry["resolved_title"] = resolved
+            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)
-
+    if i % 200 == 0:
    if i % 100 == 0:
        print(f"{i}/{len(files)} analysées")
 # --------------------------------------------------
Author	SHA1	Message	Date
Maxime Réaux	186492de85	WIP	2026-04-16 10:04:58 +02:00
Maxime Réaux	61d7f6b646	avoid overwrite homonym canonicals	2026-04-15 12:10:32 +02:00
Maxime Réaux	8e9289998b	keep error pages with fallback content	2026-04-15 10:36:21 +02:00