From 8e9289998b111cbde36edf7e9fe8d60f24e4024a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20R=C3=A9aux?= Date: Wed, 15 Apr 2026 10:36:21 +0200 Subject: [PATCH 1/3] keep error pages with fallback content --- extract_content.py | 64 +++++++++++++++++++++++++++++++++++++----- scan_internal_links.py | 13 ++++----- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/extract_content.py b/extract_content.py index 18ceabc..7d0b1df 100644 --- a/extract_content.py +++ b/extract_content.py @@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [ "border", ] +ERROR_PAGE_PATTERNS = [ + # 5xx + "503 service unavailable", + "502 bad gateway", + "500 internal server error", + "504 gateway time", + + # 4xx + "400 bad request", + "401 unauthorized", + "403 forbidden", + "404 not found", + "408 request time", + "419 page expired", + "429 too many requests", + + # génériques + "temporarily busy", + "server error", + "internal error", + "page not found", + "request could not be satisfied", +] + # ====================== # HELPERS # ====================== @@ -238,13 +262,32 @@ def remove_intro_rule_box(content): if getattr(el, "name", None) == "p": break +def is_error_page(soup: BeautifulSoup) -> bool: + text = soup.get_text(" ", strip=True).lower() + return any(p in text for p in ERROR_PAGE_PATTERNS) + +def build_fallback_html(title: str, filename: str) -> str: + safe_title = title or filename.replace("_", " ").replace(".html", "") + + return f""" + + + {safe_title} + + +

+    <h1>{safe_title}</h1>
+    <p>Lost content (HTTrack) in {filename}</p>
+ + +""" + # ====================== # CORE FUNCTIONS # ====================== def clean_html_file(input_path: Path, output_path: Path): - html = input_path.read_text(encoding="utf-8", errors="ignore") - soup = BeautifulSoup(html, "html.parser") + html_page = input_path.read_text(encoding="utf-8", errors="ignore") + soup = BeautifulSoup(html_page, "html.parser") # Remove comments (HTTrack etc.) for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): @@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path): for el in soup.find_all(tag): el.decompose() - # Extract main content content = soup.select_one("#mw-content-text") - if not content: + if not content or is_error_page(soup): print(f"[WARN] No content in {input_path.name}") + fallback = build_fallback_html( + title=soup.title.get_text(strip=True) if soup.title else "", + filename=input_path.name + ) + output_path.write_text(fallback, encoding="utf-8") return remove_intro_rule_box(content) @@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path): def process_all(): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - for file in SOURCE_DIR.glob("*.html"): + files = list(SOURCE_DIR.glob("*.html")) + total = len(files) + print(f"{total} fichiers trouvés") + for i, file in enumerate(files, start=1): output_file = OUTPUT_DIR / file.name clean_html_file(file, output_file) - + if i % 200 == 0 or i == total: + print(f"{i}/{total} analysés ({i/total:.1%})") print("✅ Cleaning complete") diff --git a/scan_internal_links.py b/scan_internal_links.py index 0858a57..25fc6d7 100644 --- a/scan_internal_links.py +++ b/scan_internal_links.py @@ -3,6 +3,7 @@ import json import re from bs4 import BeautifulSoup from urllib.parse import urlparse, parse_qs, unquote +import unicodedata # -------------------------------------------------- # PATHS @@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values()) # HELPERS # -------------------------------------------------- -def normalize_title(title: str | None): - if not title: - return None - - title = unquote(title) +def normalize_title(title: str) -> str: + title = title.strip() + title = unicodedata.normalize("NFKC", title) title = title.replace("_", " ") - title = re.sub(r"\s+", " ", title.strip()) + title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"') + title = re.sub(r"\s+", " ", title) return title.casefold() - # ------------------------- # Extract MediaWiki target # ------------------------- From 61d7f6b6463076d8c9b7845d2f2d3e2d1df0f2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20R=C3=A9aux?= Date: Wed, 15 Apr 2026 12:10:32 +0200 Subject: [PATCH 2/3] avoid overwrite homonym canonicals --- prepare_pages_and_registry.py | 26 ++++++++++++++++---------- scan_internal_links.py | 4 +++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index f6517c6..1f0b471 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -9,7 +9,7 @@ from difflib import SequenceMatcher from bs4 import BeautifulSoup import unicodedata -SOURCE_DIR = Path("../test") +SOURCE_DIR = Path("../original_index") OUTPUT_DIR = Path("../output") PAGES_DIR = Path(OUTPUT_DIR / "pages") @@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str: title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html" ) - +output_canonical_pages = {} +name_registry = {} copied = 0 +collision = 0 total = 
len(canonical_pages) for i, (article_id, data) in enumerate(canonical_pages.items(), 1): - src = data["path"] - dst_name = title_to_filename(data["title"]) - dst = PAGES_DIR / dst_name - + base_name = title_to_filename(data["title"]) + if base_name in name_registry: + base_name = Path(base_name).stem + base_name = f"{base_name}__{article_id}.html" + collision += 1 + problems.append(f"Resolved collision: {base_name} (from {src})") + name_registry[base_name] = article_id + dst = PAGES_DIR / base_name try: shutil.copy2(src, dst) - canonical_pages[article_id] = dst_name + output_canonical_pages[article_id] = base_name copied += 1 except Exception as e: problems.append(f"Copy failed {src}: {e}") - if i % 200 == 0 or i == total: print(f"{i}/{total} copiés") print(f"{copied} pages copiées") +print(f"{collision} collisions détectées") # -------------------------------------------------- # SAVE REGISTRY # -------------------------------------------------- registry = { - "canonical_pages": canonical_pages, + "canonical_pages": output_canonical_pages, "equivalences": equivalences, "potential_tags": potential_tags, "ignored_pages": ignored_pages, @@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f: with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write("=== MIGRATION REPORT ===\n") - f.write(f"Canonical pages: {len(canonical_pages)}\n") + f.write(f"Canonical pages: {len(output_canonical_pages)}\n") f.write(f"Equivalences: {len(equivalences)}\n") f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Problems: {len(problems)}\n\n") diff --git a/scan_internal_links.py b/scan_internal_links.py index 25fc6d7..385e9a7 100644 --- a/scan_internal_links.py +++ b/scan_internal_links.py @@ -9,7 +9,7 @@ import unicodedata # PATHS # -------------------------------------------------- -PAGES_DIR = Path("../output/pages") +PAGES_DIR = Path("../output/cleaned_pages") REGISTRY_PATH = Path("../output/equivalence_registry.json") OUTPUT_DIR = Path("../output/link_scan") @@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values()) # -------------------------------------------------- def normalize_title(title: str) -> str: + if not title: + return title = title.strip() title = unicodedata.normalize("NFKC", title) title = title.replace("_", " ") From 186492de859e069c81e823e7792f57057beab6cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20R=C3=A9aux?= Date: Thu, 16 Apr 2026 10:04:58 +0200 Subject: [PATCH 3/3] WIP --- boostack_create_pages.py | 89 +++++++++++++++++++++++++++++ scan_internal_links.py | 120 ++++++++++++++++----------------------- 2 files changed, 137 insertions(+), 72 deletions(-) create mode 100644 boostack_create_pages.py diff --git a/boostack_create_pages.py b/boostack_create_pages.py new file mode 100644 index 0000000..0dd4f23 --- /dev/null +++ b/boostack_create_pages.py @@ -0,0 +1,89 @@ +import requests +import sys + +# ========================== +# CONFIGURATION +# ========================== + +BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api" +API_TOKEN_ID = "VOTRE_TOKEN_ID" +API_TOKEN_SECRET = "VOTRE_TOKEN_SECRET" + +PAGES_FILE = "pages.txt" + +DEFAULT_CONTENT = """ +

+<p>Page restaurée automatiquement depuis l'ancien wiki.</p>
+""" + +# ========================== +# HEADERS +# ========================== + +HEADERS = { + "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}", + "Content-Type": "application/json" +} + +# ========================== +# FUNCTIONS +# ========================== + +def create_page(title, chapter_id, content=DEFAULT_CONTENT): + """Create a page in BookStack""" + url = f"{BOOKSTACK_API_URL}/pages" + + payload = { + "name": title, + "html": content, + "chapter_id": int(chapter_id) + } + + response = requests.post(url, headers=HEADERS, json=payload) + + if response.status_code == 200: + page_id = response.json().get("id") + print(f"[OK] Page créée : '{title}' (ID {page_id})") + return page_id + else: + print(f"[ERREUR] Impossible de créer '{title}'") + print(response.status_code, response.text) + return None + + +def load_pages(filename): + """Load pages list from file""" + pages = [] + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + try: + title, chapter_id = line.split("|") + pages.append((title.strip(), chapter_id.strip())) + except ValueError: + print(f"[IGNORÉ] Ligne invalide : {line}") + return pages + + +# ========================== +# MAIN +# ========================== + +def main(): + pages = load_pages(PAGES_FILE) + + if not pages: + print("Aucune page à créer.") + sys.exit(0) + + print(f"{len(pages)} pages à créer...\n") + + for title, chapter_id in pages: + create_page(title, chapter_id) + + print("\nImport terminé.") + + +if __name__ == "__main__": + main() diff --git a/scan_internal_links.py b/scan_internal_links.py index 385e9a7..e74680b 100644 --- a/scan_internal_links.py +++ b/scan_internal_links.py @@ -6,14 +6,24 @@ from urllib.parse import urlparse, parse_qs, unquote import unicodedata # -------------------------------------------------- -# PATHS +# CONFIG # -------------------------------------------------- -PAGES_DIR = Path("../output/cleaned_pages") -REGISTRY_PATH = Path("../output/equivalence_registry.json") -OUTPUT_DIR = Path("../output/link_scan") +PAGES_DIR = Path("../output_ok/cleaned_pages") +REGISTRY_PATH = Path("../output_ok/equivalence_registry.json") +OUTPUT_DIR = Path("../output_ok/link_scan") OUTPUT_DIR.mkdir(exist_ok=True) +IGNORED_PREFIXES = ( + "file ", + "image ", + "category ", + "template ", + "special ", + "help ", + "user ", + "talk ", +) # -------------------------------------------------- # LOAD REGISTRY @@ -34,88 +44,68 @@ def normalize_title(title: str) -> str: if not title: return title = title.strip() + title = unquote(title) + title = Path(title).stem title = unicodedata.normalize("NFKC", title) title = title.replace("_", " ") title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"') title = re.sub(r"\s+", " ", title) return title.casefold() -# ------------------------- -# Extract MediaWiki target -# ------------------------- - def extract_mediawiki_target(href: str): - if not href: return None - - # ignore anchors if href.startswith("#"): return None - parsed = urlparse(href) - - # external link if parsed.scheme in ("http", "https"): return None - path = parsed.path or "" - - # /wiki/Page_Name if "/wiki/" in path: return path.split("/wiki/", 1)[1] - - # index.php?title=Page if "index.php" in path: qs = parse_qs(parsed.query) if "title" in qs: return qs["title"][0] - - # fallback filename-like return Path(path).stem -# ------------------------- -# Ignore unwanted namespaces -# ------------------------- - 
-IGNORED_PREFIXES = ( - "file:", - "image:", - "template:", - "special:", - "help:", - "user:", - "talk:", -) - def is_ignored_namespace(title_norm: str): return title_norm.startswith(IGNORED_PREFIXES) - -# ------------------------- -# Extract article content -# ------------------------- - def extract_article_links(soup): - content = soup.find("div", id="mw-content-text") if not content: return [] - links = [] - for a in content.select("a[href]"): - - # ignore navboxes / metadata if a.find_parent(class_="navbox"): continue - - href = a.get("href") - links.append(href) - + links.append({ + "href": a.get("href"), + "title": a.get("title"), + "text": a.get_text(strip=True), + }) return links +def resolve_link(raw_target, title_attr): + candidates = [] + if title_attr: + candidates.append(title_attr) + if raw_target: + candidates.append(raw_target) + for candidate in candidates: + norm = normalize_title(candidate) + if not norm: + continue + if is_ignored_namespace(norm): + return None, "ignored" + if norm in equivalences: + return equivalences[norm], "equivalence" + filename = norm.replace(" ", "_") + ".html" + if filename in valid_targets: + return filename, "direct" + return None, "unresolved" # -------------------------------------------------- # MAIN SCAN @@ -123,43 +113,29 @@ def extract_article_links(soup): resolved_links = [] unresolved_links = [] - files = list(PAGES_DIR.glob("*.html")) print(f"{len(files)} pages à analyser") for i, file_path in enumerate(files, 1): - html = file_path.read_text(encoding="utf-8", errors="ignore") soup = BeautifulSoup(html, "html.parser") - links = extract_article_links(soup) - - for href in links: - - raw_target = extract_mediawiki_target(href) - norm = normalize_title(raw_target) - - if not norm: - continue - - if is_ignored_namespace(norm): - continue - + for link in links: + raw_target = extract_mediawiki_target(link["href"]) + resolved, method = resolve_link(raw_target, link["title"]) entry = { "source": file_path.name, - "href": href, - "normalized": norm, + "href": link["href"], + "title": link["title"], + "method": method, } - - resolved = equivalences.get(norm) - if resolved: - entry["resolved_title"] = resolved + entry["resolved"] = resolved resolved_links.append(entry) else: + entry["raw_target"] = raw_target unresolved_links.append(entry) - - if i % 100 == 0: + if i % 200 == 0: print(f"{i}/{len(files)} analysées") # --------------------------------------------------