diff --git a/boostack_create_pages.py b/boostack_create_pages.py
deleted file mode 100644
index 0dd4f23..0000000
--- a/boostack_create_pages.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import requests
-import sys
-
-# ==========================
-# CONFIGURATION
-# ==========================
-
-BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api"
-API_TOKEN_ID = "VOTRE_TOKEN_ID"
-API_TOKEN_SECRET = "VOTRE_TOKEN_SECRET"
-
-PAGES_FILE = "pages.txt"
-
-DEFAULT_CONTENT = """
-<p>Page restaurée automatiquement depuis l'ancien wiki.</p>
-""" - -# ========================== -# HEADERS -# ========================== - -HEADERS = { - "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}", - "Content-Type": "application/json" -} - -# ========================== -# FUNCTIONS -# ========================== - -def create_page(title, chapter_id, content=DEFAULT_CONTENT): - """Create a page in BookStack""" - url = f"{BOOKSTACK_API_URL}/pages" - - payload = { - "name": title, - "html": content, - "chapter_id": int(chapter_id) - } - - response = requests.post(url, headers=HEADERS, json=payload) - - if response.status_code == 200: - page_id = response.json().get("id") - print(f"[OK] Page créée : '{title}' (ID {page_id})") - return page_id - else: - print(f"[ERREUR] Impossible de créer '{title}'") - print(response.status_code, response.text) - return None - - -def load_pages(filename): - """Load pages list from file""" - pages = [] - with open(filename, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line or line.startswith("#"): - continue - try: - title, chapter_id = line.split("|") - pages.append((title.strip(), chapter_id.strip())) - except ValueError: - print(f"[IGNORÉ] Ligne invalide : {line}") - return pages - - -# ========================== -# MAIN -# ========================== - -def main(): - pages = load_pages(PAGES_FILE) - - if not pages: - print("Aucune page à créer.") - sys.exit(0) - - print(f"{len(pages)} pages à créer...\n") - - for title, chapter_id in pages: - create_page(title, chapter_id) - - print("\nImport terminé.") - - -if __name__ == "__main__": - main() diff --git a/extract_content.py b/extract_content.py index 7d0b1df..18ceabc 100644 --- a/extract_content.py +++ b/extract_content.py @@ -51,30 +51,6 @@ STRIP_ATTRIBUTES = [ "border", ] -ERROR_PAGE_PATTERNS = [ - # 5xx - "503 service unavailable", - "502 bad gateway", - "500 internal server error", - "504 gateway time", - - # 4xx - "400 bad request", - "401 unauthorized", - "403 forbidden", - "404 not found", - "408 request time", - "419 page expired", - "429 too many requests", - - # génériques - "temporarily busy", - "server error", - "internal error", - "page not found", - "request could not be satisfied", -] - # ====================== # HELPERS # ====================== @@ -262,32 +238,13 @@ def remove_intro_rule_box(content): if getattr(el, "name", None) == "p": break -def is_error_page(soup: BeautifulSoup) -> bool: - text = soup.get_text(" ", strip=True).lower() - return any(p in text for p in ERROR_PAGE_PATTERNS) - -def build_fallback_html(title: str, filename: str) -> str: - safe_title = title or filename.replace("_", " ").replace(".html", "") - - return f""" - - - {safe_title} - - -

{safe_title}

-

Lost content (HTTracker) in {filename}

- - -""" - # ====================== # CORE FUNCTIONS # ====================== def clean_html_file(input_path: Path, output_path: Path): - html_page = input_path.read_text(encoding="utf-8", errors="ignore") - soup = BeautifulSoup(html_page, "html.parser") + html = input_path.read_text(encoding="utf-8", errors="ignore") + soup = BeautifulSoup(html, "html.parser") # Remove comments (HTTrack etc.) for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): @@ -298,15 +255,11 @@ def clean_html_file(input_path: Path, output_path: Path): for el in soup.find_all(tag): el.decompose() + # Extract main content content = soup.select_one("#mw-content-text") - if not content or is_error_page(soup): + if not content: print(f"[WARN] No content in {input_path.name}") - fallback = build_fallback_html( - title=soup.title.get_text(strip=True) if soup.title else "", - filename=input_path.name - ) - output_path.write_text(fallback, encoding="utf-8") return remove_intro_rule_box(content) @@ -409,14 +362,11 @@ def clean_html_file(input_path: Path, output_path: Path): def process_all(): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - files = list(SOURCE_DIR.glob("*.html")) - total = len(files) - print(f"{total} fichiers trouvés") - for i, file in enumerate(files, start=1): + + for file in SOURCE_DIR.glob("*.html"): output_file = OUTPUT_DIR / file.name clean_html_file(file, output_file) - if i % 200 == 0 or i == total: - print(f"{i}/{total} analysés ({i/total:.1%})") + print("✅ Cleaning complete") diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index 1f0b471..f6517c6 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -9,7 +9,7 @@ from difflib import SequenceMatcher from bs4 import BeautifulSoup import unicodedata -SOURCE_DIR = Path("../original_index") +SOURCE_DIR = Path("../test") OUTPUT_DIR = Path("../output") PAGES_DIR = Path(OUTPUT_DIR / "pages") @@ -535,40 +535,34 @@ def title_to_filename(title: str) -> str: title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html" ) -output_canonical_pages = {} -name_registry = {} + copied = 0 -collision = 0 total = len(canonical_pages) for i, (article_id, data) in enumerate(canonical_pages.items(), 1): + src = data["path"] - base_name = title_to_filename(data["title"]) - if base_name in name_registry: - base_name = Path(base_name).stem - base_name = f"{base_name}__{article_id}.html" - collision += 1 - problems.append(f"Resolved collision: {base_name} (from {src})") - name_registry[base_name] = article_id - dst = PAGES_DIR / base_name + dst_name = title_to_filename(data["title"]) + dst = PAGES_DIR / dst_name + try: shutil.copy2(src, dst) - output_canonical_pages[article_id] = base_name + canonical_pages[article_id] = dst_name copied += 1 except Exception as e: problems.append(f"Copy failed {src}: {e}") + if i % 200 == 0 or i == total: print(f"{i}/{total} copiés") print(f"{copied} pages copiées") -print(f"{collision} collisions détectées") # -------------------------------------------------- # SAVE REGISTRY # -------------------------------------------------- registry = { - "canonical_pages": output_canonical_pages, + "canonical_pages": canonical_pages, "equivalences": equivalences, "potential_tags": potential_tags, "ignored_pages": ignored_pages, @@ -585,7 +579,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f: with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write("=== MIGRATION REPORT ===\n") - f.write(f"Canonical pages: 
+    f.write(f"Canonical pages: {len(canonical_pages)}\n")
     f.write(f"Equivalences: {len(equivalences)}\n")
     f.write(f"Ignored: {len(ignored_pages)}\n")
     f.write(f"Problems: {len(problems)}\n\n")
diff --git a/scan_internal_links.py b/scan_internal_links.py
index e74680b..0858a57 100644
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -3,27 +3,16 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
-import unicodedata
 
 # --------------------------------------------------
-# CONFIG
+# PATHS
 # --------------------------------------------------
 
-PAGES_DIR = Path("../output_ok/cleaned_pages")
-REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
-OUTPUT_DIR = Path("../output_ok/link_scan")
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
 
 OUTPUT_DIR.mkdir(exist_ok=True)
 
-IGNORED_PREFIXES = (
-    "file ",
-    "image ",
-    "category ",
-    "template ",
-    "special ",
-    "help ",
-    "user ",
-    "talk ",
-)
 # --------------------------------------------------
 # LOAD REGISTRY
@@ -40,72 +29,92 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str) -> str:
+def normalize_title(title: str | None):
     if not title:
-        return
-    title = title.strip()
+        return None
+
     title = unquote(title)
-    title = Path(title).stem
-    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
-    title = re.sub(r"\s+", " ", title)
+    title = re.sub(r"\s+", " ", title.strip())
     return title.casefold()
 
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
+
 def extract_mediawiki_target(href: str):
+
     if not href:
         return None
+
+    # ignore anchors
     if href.startswith("#"):
         return None
+
     parsed = urlparse(href)
+
+    # external link
     if parsed.scheme in ("http", "https"):
         return None
+
     path = parsed.path or ""
+
+    # /wiki/Page_Name
     if "/wiki/" in path:
         return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
     if "index.php" in path:
         qs = parse_qs(parsed.query)
         if "title" in qs:
             return qs["title"][0]
+
+    # fallback filename-like
     return Path(path).stem
 
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
+
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
+
 def is_ignored_namespace(title_norm: str):
     return title_norm.startswith(IGNORED_PREFIXES)
 
+
+# -------------------------
+# Extract article content
+# -------------------------
+
 def extract_article_links(soup):
+
     content = soup.find("div", id="mw-content-text")
     if not content:
         return []
+
     links = []
+
     for a in content.select("a[href]"):
+
+        # ignore navboxes / metadata
         if a.find_parent(class_="navbox"):
             continue
-        links.append({
-            "href": a.get("href"),
-            "title": a.get("title"),
-            "text": a.get_text(strip=True),
-        })
+
+        href = a.get("href")
+        links.append(href)
+
     return links
 
-def resolve_link(raw_target, title_attr):
-    candidates = []
-    if title_attr:
-        candidates.append(title_attr)
-    if raw_target:
-        candidates.append(raw_target)
-    for candidate in candidates:
-        norm = normalize_title(candidate)
-        if not norm:
-            continue
-        if is_ignored_namespace(norm):
-            return None, "ignored"
-        if norm in equivalences:
-            return equivalences[norm], "equivalence"
-        filename = norm.replace(" ", "_") + ".html"
".html" - if filename in valid_targets: - return filename, "direct" - return None, "unresolved" # -------------------------------------------------- # MAIN SCAN @@ -113,29 +122,43 @@ def resolve_link(raw_target, title_attr): resolved_links = [] unresolved_links = [] + files = list(PAGES_DIR.glob("*.html")) print(f"{len(files)} pages à analyser") for i, file_path in enumerate(files, 1): + html = file_path.read_text(encoding="utf-8", errors="ignore") soup = BeautifulSoup(html, "html.parser") + links = extract_article_links(soup) - for link in links: - raw_target = extract_mediawiki_target(link["href"]) - resolved, method = resolve_link(raw_target, link["title"]) + + for href in links: + + raw_target = extract_mediawiki_target(href) + norm = normalize_title(raw_target) + + if not norm: + continue + + if is_ignored_namespace(norm): + continue + entry = { "source": file_path.name, - "href": link["href"], - "title": link["title"], - "method": method, + "href": href, + "normalized": norm, } + + resolved = equivalences.get(norm) + if resolved: - entry["resolved"] = resolved + entry["resolved_title"] = resolved resolved_links.append(entry) else: - entry["raw_target"] = raw_target unresolved_links.append(entry) - if i % 200 == 0: + + if i % 100 == 0: print(f"{i}/{len(files)} analysées") # --------------------------------------------------