"""Create placeholder pages in a BookStack wiki from a pipe-delimited list.

Reads "title | chapter_id" lines from PAGES_FILE and creates one page per
line through the BookStack REST API.  Failures are reported and skipped so
one bad line never aborts the whole batch.
"""

import sys

import requests

# ==========================
# CONFIGURATION
# ==========================

BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api"
API_TOKEN_ID = "VOTRE_TOKEN_ID"
API_TOKEN_SECRET = "VOTRE_TOKEN_SECRET"

PAGES_FILE = "pages.txt"

# Seconds before an API call is aborted.  Without an explicit timeout,
# requests.post() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

DEFAULT_CONTENT = """
<div>
Page restaurée automatiquement depuis l'ancien wiki.
</div>
"""

# ==========================
# HEADERS
# ==========================

HEADERS = {
    "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}",
    "Content-Type": "application/json",
}

# ==========================
# FUNCTIONS
# ==========================

def create_page(title, chapter_id, content=DEFAULT_CONTENT):
    """Create a page in BookStack.

    Args:
        title: Page name.
        chapter_id: Chapter receiving the page (int, or numeric string).
        content: HTML body; defaults to the restoration placeholder.

    Returns:
        The new page id on success, None on failure.  Errors (HTTP or
        network) are printed, never raised, so the batch keeps going.
    """
    url = f"{BOOKSTACK_API_URL}/pages"

    payload = {
        "name": title,
        "html": content,
        "chapter_id": int(chapter_id),
    }

    try:
        response = requests.post(
            url, headers=HEADERS, json=payload, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        # Network-level failure (DNS, refused, timeout): report and move on.
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(exc)
        return None

    # BookStack answers 200 on create; accept any 2xx rather than the
    # single exact code, so an API returning 201 is not flagged as failure.
    if response.ok:
        page_id = response.json().get("id")
        print(f"[OK] Page créée : '{title}' (ID {page_id})")
        return page_id

    print(f"[ERREUR] Impossible de créer '{title}'")
    print(response.status_code, response.text)
    return None


def load_pages(filename):
    """Load (title, chapter_id) pairs from *filename*.

    Format: one "title | chapter_id" per line.  Blank lines and lines
    starting with '#' are skipped.  The split anchors on the LAST '|' so
    titles may themselves contain pipes, and the chapter id is validated
    as numeric here so create_page's int() cannot blow up later.
    """
    pages = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            title, sep, chapter_id = line.rpartition("|")
            title = title.strip()
            chapter_id = chapter_id.strip()
            if not sep or not title or not chapter_id.isdigit():
                print(f"[IGNORÉ] Ligne invalide : {line}")
                continue
            pages.append((title, chapter_id))
    return pages


# ==========================
# MAIN
# ==========================

def main():
    """Entry point: read the page list and create every page in order."""
    pages = load_pages(PAGES_FILE)

    if not pages:
        print("Aucune page à créer.")
        sys.exit(0)

    print(f"{len(pages)} pages à créer...\n")

    for title, chapter_id in pages:
        create_page(title, chapter_id)

    print("\nImport terminé.")


if __name__ == "__main__":
    main()
# --------------------------------------------------
# CONFIG
# --------------------------------------------------

PAGES_DIR = Path("../output_ok/cleaned_pages")
REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
OUTPUT_DIR = Path("../output_ok/link_scan")
OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki namespaces we never try to resolve.  normalize_title() keeps
# the ':' separator (it only rewrites '_' to ' '), so these prefixes must
# end with ':' — space-suffixed prefixes like "file " would never match a
# normalized "file:x" title and the filter would silently do nothing.
IGNORED_PREFIXES = (
    "file:",
    "image:",
    "category:",
    "template:",
    "special:",
    "help:",
    "user:",
    "talk:",
)

# --------------------------------------------------
# LOAD REGISTRY
# --------------------------------------------------
# NOTE(review): the registry-loading code that defines `equivalences` and
# `valid_targets` lives between the patch hunks and is unchanged here —
# confirm both names are defined before resolve_link() runs.


def normalize_title(title: str) -> str:
    """Normalize a wiki title or href fragment for registry lookup.

    URL-decodes, strips a trailing page-file extension, NFKC-normalizes,
    maps '_' to ' ', straightens curly quotes, collapses whitespace and
    casefolds.  Falsy input is returned unchanged.
    """
    if not title:
        return title
    title = title.strip()
    title = unquote(title)
    # Only strip real page-file extensions.  Path(title).stem would also
    # truncate legitimate titles containing a dot (e.g. "Mk. II" -> "Mk").
    lowered = title.lower()
    for ext in (".html", ".htm", ".php"):
        if lowered.endswith(ext):
            title = title[: -len(ext)]
            break
    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    title = re.sub(r"\s+", " ", title)
    return title.casefold()


def extract_mediawiki_target(href: str):
    """Return the wiki page name an href points at, or None.

    None for empty hrefs, pure anchors (#...), and external http(s)
    links.  Handles /wiki/Page_Name and index.php?title=Page forms,
    falling back to the path's file stem for filename-like hrefs.
    """
    if not href:
        return None
    if href.startswith("#"):
        return None
    parsed = urlparse(href)
    if parsed.scheme in ("http", "https"):
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem


def is_ignored_namespace(title_norm: str):
    """True when the normalized title lives in a namespace we skip."""
    return title_norm.startswith(IGNORED_PREFIXES)


def extract_article_links(soup):
    """Collect candidate links from the article body of a parsed page.

    Only anchors inside div#mw-content-text count, and anchors sitting in
    a navbox are skipped (navigation chrome, not article content).

    Returns a list of {"href", "title", "text"} dicts.
    """
    content = soup.find("div", id="mw-content-text")
    if not content:
        return []
    links = []
    for a in content.select("a[href]"):
        if a.find_parent(class_="navbox"):
            continue
        links.append({
            "href": a.get("href"),
            "title": a.get("title"),
            "text": a.get_text(strip=True),
        })
    return links


def resolve_link(raw_target, title_attr):
    """Resolve a link to a known target page.

    Tries the anchor's title attribute first (cleanest form), then the
    href-derived target.

    Returns:
        (resolved, method) — method is one of "ignored", "equivalence",
        "direct" or "unresolved"; resolved is None unless a match was found.
    """
    candidates = []
    if title_attr:
        candidates.append(title_attr)
    if raw_target:
        candidates.append(raw_target)
    for candidate in candidates:
        norm = normalize_title(candidate)
        if not norm:
            continue
        if is_ignored_namespace(norm):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        filename = norm.replace(" ", "_") + ".html"
        if filename in valid_targets:
            return filename, "direct"
    return None, "unresolved"

# --------------------------------------------------
# MAIN SCAN
# --------------------------------------------------

resolved_links = []
unresolved_links = []
files = list(PAGES_DIR.glob("*.html"))
print(f"{len(files)} pages à analyser")

for i, file_path in enumerate(files, 1):
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")
    links = extract_article_links(soup)
    for link in links:
        raw_target = extract_mediawiki_target(link["href"])
        resolved, method = resolve_link(raw_target, link["title"])
        # Ignored namespaces are neither resolved nor broken; skipping
        # them keeps the unresolved report meaningful (pre-patch behavior).
        if method == "ignored":
            continue
        entry = {
            "source": file_path.name,
            "href": link["href"],
            "title": link["title"],
            "method": method,
        }
        if resolved:
            entry["resolved"] = resolved
            resolved_links.append(entry)
        else:
            entry["raw_target"] = raw_target
            unresolved_links.append(entry)
    if i % 200 == 0:
        print(f"{i}/{len(files)} analysées")