Compare commits

3 commits: 4e473ba2c9 ... 186492de85

| Author | SHA1 | Date |
|---|---|---|
|  | 186492de85 |  |
|  | 61d7f6b646 |  |
|  | 8e9289998b |  |
4 changed files with 216 additions and 94 deletions
boostack_create_pages.py (Normal file, 89 additions)
@@ -0,0 +1,89 @@
```python
import requests
import sys

# ==========================
# CONFIGURATION
# ==========================

BOOKSTACK_API_URL = "https://wiki-warmachine.ungol.fr/api"
API_TOKEN_ID = "VOTRE_TOKEN_ID"
API_TOKEN_SECRET = "VOTRE_TOKEN_SECRET"

PAGES_FILE = "pages.txt"

DEFAULT_CONTENT = """
<p><em>Page restaurée automatiquement depuis l'ancien wiki.</em></p>
"""

# ==========================
# HEADERS
# ==========================

HEADERS = {
    "Authorization": f"Token {API_TOKEN_ID}:{API_TOKEN_SECRET}",
    "Content-Type": "application/json"
}

# ==========================
# FUNCTIONS
# ==========================

def create_page(title, chapter_id, content=DEFAULT_CONTENT):
    """Create a page in BookStack"""
    url = f"{BOOKSTACK_API_URL}/pages"

    payload = {
        "name": title,
        "html": content,
        "chapter_id": int(chapter_id)
    }

    response = requests.post(url, headers=HEADERS, json=payload)

    if response.status_code == 200:
        page_id = response.json().get("id")
        print(f"[OK] Page créée : '{title}' (ID {page_id})")
        return page_id
    else:
        print(f"[ERREUR] Impossible de créer '{title}'")
        print(response.status_code, response.text)
        return None


def load_pages(filename):
    """Load pages list from file"""
    pages = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            try:
                title, chapter_id = line.split("|")
                pages.append((title.strip(), chapter_id.strip()))
            except ValueError:
                print(f"[IGNORÉ] Ligne invalide : {line}")
    return pages


# ==========================
# MAIN
# ==========================

def main():
    pages = load_pages(PAGES_FILE)

    if not pages:
        print("Aucune page à créer.")
        sys.exit(0)

    print(f"{len(pages)} pages à créer...\n")

    for title, chapter_id in pages:
        create_page(title, chapter_id)

    print("\nImport terminé.")


if __name__ == "__main__":
    main()
```
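For reference, `load_pages` expects one `title|chapter_id` pair per line; blank lines and `#` comments are skipped, and any line that does not split into exactly two fields on `|` is reported as invalid and ignored. A hypothetical `pages.txt` (titles and chapter IDs invented here) could look like:

```text
# Title | BookStack chapter ID
Cygnar | 12
Khador | 12
Retribution of Scyrah | 14
```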
Changes to the HTML cleaning script:

```diff
@@ -51,6 +51,30 @@ STRIP_ATTRIBUTES = [
     "border",
 ]
 
+ERROR_PAGE_PATTERNS = [
+    # 5xx
+    "503 service unavailable",
+    "502 bad gateway",
+    "500 internal server error",
+    "504 gateway time",
+
+    # 4xx
+    "400 bad request",
+    "401 unauthorized",
+    "403 forbidden",
+    "404 not found",
+    "408 request time",
+    "419 page expired",
+    "429 too many requests",
+
+    # génériques
+    "temporarily busy",
+    "server error",
+    "internal error",
+    "page not found",
+    "request could not be satisfied",
+]
+
 # ======================
 # HELPERS
 # ======================
```
```diff
@@ -238,13 +262,32 @@ def remove_intro_rule_box(content):
         if getattr(el, "name", None) == "p":
             break
 
+def is_error_page(soup: BeautifulSoup) -> bool:
+    text = soup.get_text(" ", strip=True).lower()
+    return any(p in text for p in ERROR_PAGE_PATTERNS)
+
+
+def build_fallback_html(title: str, filename: str) -> str:
+    safe_title = title or filename.replace("_", " ").replace(".html", "")
+
+    return f"""<html>
+<head>
+<meta charset="utf-8">
+<title>{safe_title}</title>
+</head>
+<body>
+<h1>{safe_title}</h1>
+<p>Lost content (HTTracker) in {filename}</p>
+</body>
+</html>
+"""
+
 # ======================
 # CORE FUNCTIONS
 # ======================
 
 def clean_html_file(input_path: Path, output_path: Path):
-    html = input_path.read_text(encoding="utf-8", errors="ignore")
-    soup = BeautifulSoup(html, "html.parser")
+    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html_page, "html.parser")
 
     # Remove comments (HTTrack etc.)
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
```
```diff
@@ -255,11 +298,15 @@ def clean_html_file(input_path: Path, output_path: Path):
         for el in soup.find_all(tag):
             el.decompose()
 
 
     # Extract main content
     content = soup.select_one("#mw-content-text")
-    if not content:
+    if not content or is_error_page(soup):
         print(f"[WARN] No content in {input_path.name}")
+        fallback = build_fallback_html(
+            title=soup.title.get_text(strip=True) if soup.title else "",
+            filename=input_path.name
+        )
+        output_path.write_text(fallback, encoding="utf-8")
         return
     remove_intro_rule_box(content)
 
```
```diff
@@ -362,11 +409,14 @@ def clean_html_file(input_path: Path, output_path: Path):
 
 def process_all():
     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-    for file in SOURCE_DIR.glob("*.html"):
+    files = list(SOURCE_DIR.glob("*.html"))
+    total = len(files)
+    print(f"{total} fichiers trouvés")
+    for i, file in enumerate(files, start=1):
         output_file = OUTPUT_DIR / file.name
         clean_html_file(file, output_file)
+        if i % 200 == 0 or i == total:
+            print(f"{i}/{total} analysés ({i/total:.1%})")
     print("✅ Cleaning complete")
 
 
```
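As a sanity check, here is a minimal, self-contained sketch of the new error-page detection, with an abbreviated pattern list; the real script checks the full `ERROR_PAGE_PATTERNS` and, on a hit, writes the `build_fallback_html` placeholder instead of keeping a mirrored error page:

```python
from bs4 import BeautifulSoup

# Abbreviated pattern list for illustration; the script defines many more.
ERROR_PAGE_PATTERNS = ["404 not found", "503 service unavailable"]

def is_error_page(soup: BeautifulSoup) -> bool:
    # Same logic as the new helper: flatten the page to text, look for patterns.
    text = soup.get_text(" ", strip=True).lower()
    return any(p in text for p in ERROR_PAGE_PATTERNS)

# An HTTrack mirror sometimes saves the server's error page in place of the article.
saved = BeautifulSoup("<html><body><h1>404 Not Found</h1></body></html>", "html.parser")
print(is_error_page(saved))  # True: clean_html_file now writes a fallback page

page = BeautifulSoup("<div id='mw-content-text'><p>Rules text</p></div>", "html.parser")
print(is_error_page(page))   # False: normal cleaning path
```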
Changes to the copy/registry script:

```diff
@@ -9,7 +9,7 @@ from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
 import unicodedata
 
-SOURCE_DIR = Path("../test")
+SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
 
 PAGES_DIR = Path(OUTPUT_DIR / "pages")
```
```diff
@@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
     title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
 )
 
+output_canonical_pages = {}
+name_registry = {}
 copied = 0
+collision = 0
 total = len(canonical_pages)
 
 for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
 
     src = data["path"]
-    dst_name = title_to_filename(data["title"])
-    dst = PAGES_DIR / dst_name
+    base_name = title_to_filename(data["title"])
+    if base_name in name_registry:
+        base_name = Path(base_name).stem
+        base_name = f"{base_name}__{article_id}.html"
+        collision += 1
+        problems.append(f"Resolved collision: {base_name} (from {src})")
+    name_registry[base_name] = article_id
+    dst = PAGES_DIR / base_name
     try:
         shutil.copy2(src, dst)
-        canonical_pages[article_id] = dst_name
+        output_canonical_pages[article_id] = base_name
         copied += 1
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
 
     if i % 200 == 0 or i == total:
         print(f"{i}/{total} copiés")
 
 print(f"{copied} pages copiées")
+print(f"{collision} collisions détectées")
 
 # --------------------------------------------------
 # SAVE REGISTRY
 # --------------------------------------------------
 
 registry = {
-    "canonical_pages": canonical_pages,
+    "canonical_pages": output_canonical_pages,
     "equivalences": equivalences,
     "potential_tags": potential_tags,
     "ignored_pages": ignored_pages,
```
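The collision handling introduced above can be exercised in isolation. A sketch with a simplified `title_to_filename` (the real one also normalizes curly quotes) and invented titles and article IDs:

```python
from pathlib import Path

def title_to_filename(title: str) -> str:
    # Simplified version of the script's helper, for illustration only.
    return title.replace(" ", "_").casefold() + ".html"

# Two hypothetical titles that casefold to the same filename.
name_registry = {}
for article_id, title in [(101, "Witch Hunter"), (202, "witch hunter")]:
    base_name = title_to_filename(title)
    if base_name in name_registry:
        # First writer keeps the plain name; later ones get a "__{article_id}" suffix.
        base_name = f"{Path(base_name).stem}__{article_id}.html"
    name_registry[base_name] = article_id

print(name_registry)
# {'witch_hunter.html': 101, 'witch_hunter__202.html': 202}
```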
```diff
@@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
 
 with open(REPORT_PATH, "w", encoding="utf-8") as f:
     f.write("=== MIGRATION REPORT ===\n")
-    f.write(f"Canonical pages: {len(canonical_pages)}\n")
+    f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
     f.write(f"Equivalences: {len(equivalences)}\n")
     f.write(f"Ignored: {len(ignored_pages)}\n")
     f.write(f"Problems: {len(problems)}\n\n")
 
```
Changes to the link-scan script:

```diff
@@ -3,16 +3,27 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata
 
 # --------------------------------------------------
-# PATHS
+# CONFIG
 # --------------------------------------------------
 
-PAGES_DIR = Path("../output/pages")
-REGISTRY_PATH = Path("../output/equivalence_registry.json")
-OUTPUT_DIR = Path("../output/link_scan")
+PAGES_DIR = Path("../output_ok/cleaned_pages")
+REGISTRY_PATH = Path("../output_ok/equivalence_registry.json")
+OUTPUT_DIR = Path("../output_ok/link_scan")
 
 OUTPUT_DIR.mkdir(exist_ok=True)
+IGNORED_PREFIXES = (
+    "file ",
+    "image ",
+    "category ",
+    "template ",
+    "special ",
+    "help ",
+    "user ",
+    "talk ",
+)
 
 # --------------------------------------------------
 # LOAD REGISTRY
```
```diff
@@ -29,92 +40,72 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------
 
-def normalize_title(title: str | None):
+def normalize_title(title: str) -> str:
     if not title:
-        return None
+        return
+    title = title.strip()
     title = unquote(title)
+    title = Path(title).stem
+    title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
     return title.casefold()
 
 
-# -------------------------
-# Extract MediaWiki target
-# -------------------------
-
 def extract_mediawiki_target(href: str):
     if not href:
         return None
 
-    # ignore anchors
     if href.startswith("#"):
         return None
 
     parsed = urlparse(href)
 
-    # external link
     if parsed.scheme in ("http", "https"):
         return None
 
     path = parsed.path or ""
 
-    # /wiki/Page_Name
     if "/wiki/" in path:
         return path.split("/wiki/", 1)[1]
 
-    # index.php?title=Page
     if "index.php" in path:
         qs = parse_qs(parsed.query)
         if "title" in qs:
             return qs["title"][0]
 
-    # fallback filename-like
     return Path(path).stem
 
 
-# -------------------------
-# Ignore unwanted namespaces
-# -------------------------
-
-IGNORED_PREFIXES = (
-    "file:",
-    "image:",
-    "template:",
-    "special:",
-    "help:",
-    "user:",
-    "talk:",
-)
-
 def is_ignored_namespace(title_norm: str):
     return title_norm.startswith(IGNORED_PREFIXES)
 
 
-# -------------------------
-# Extract article content
-# -------------------------
-
 def extract_article_links(soup):
     content = soup.find("div", id="mw-content-text")
     if not content:
         return []
 
     links = []
     for a in content.select("a[href]"):
         # ignore navboxes / metadata
         if a.find_parent(class_="navbox"):
             continue
-        href = a.get("href")
-        links.append(href)
+        links.append({
+            "href": a.get("href"),
+            "title": a.get("title"),
+            "text": a.get_text(strip=True),
+        })
     return links
 
+def resolve_link(raw_target, title_attr):
+    candidates = []
+    if title_attr:
+        candidates.append(title_attr)
+    if raw_target:
+        candidates.append(raw_target)
+    for candidate in candidates:
+        norm = normalize_title(candidate)
+        if not norm:
+            continue
+        if is_ignored_namespace(norm):
+            return None, "ignored"
+        if norm in equivalences:
+            return equivalences[norm], "equivalence"
+        filename = norm.replace(" ", "_") + ".html"
+        if filename in valid_targets:
+            return filename, "direct"
+    return None, "unresolved"
 
 # --------------------------------------------------
 # MAIN SCAN
 # --------------------------------------------------
```
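A condensed, self-contained sketch of the new resolution order, using stub registry data (all names hypothetical): the `<a>` title attribute is tried before the href-derived target, an equivalence hit wins over a direct filename match, and ignored namespaces short-circuit:

```python
from pathlib import Path
import re
import unicodedata
from urllib.parse import unquote

equivalences = {"cygnar army": "cygnar.html"}   # hypothetical registry entries
valid_targets = {"khador.html"}                 # hypothetical canonical filenames
IGNORED_PREFIXES = ("file ", "category ")       # abbreviated for the sketch

def normalize_title(title):
    if not title:
        return None
    title = unicodedata.normalize("NFKC", Path(unquote(title.strip())).stem)
    title = re.sub(r"\s+", " ", title.replace("_", " "))
    return title.casefold()

def resolve_link(raw_target, title_attr):
    # Condensed version of the new helper: title attribute first, then href target.
    for candidate in (title_attr, raw_target):
        norm = normalize_title(candidate)
        if not norm:
            continue
        if norm.startswith(IGNORED_PREFIXES):
            return None, "ignored"
        if norm in equivalences:
            return equivalences[norm], "equivalence"
        filename = norm.replace(" ", "_") + ".html"
        if filename in valid_targets:
            return filename, "direct"
    return None, "unresolved"

print(resolve_link("Cygnar_Army", None))     # ('cygnar.html', 'equivalence')
print(resolve_link("Khador", "Khador"))      # ('khador.html', 'direct')
print(resolve_link("File Something", None))  # (None, 'ignored')
print(resolve_link("Unknown_Page", None))    # (None, 'unresolved')
```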
```diff
@@ -122,43 +113,29 @@ def extract_article_links(soup):
 
 resolved_links = []
 unresolved_links = []
 
 files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages à analyser")
 
 for i, file_path in enumerate(files, 1):
 
     html = file_path.read_text(encoding="utf-8", errors="ignore")
     soup = BeautifulSoup(html, "html.parser")
 
     links = extract_article_links(soup)
-    for href in links:
-        raw_target = extract_mediawiki_target(href)
-        norm = normalize_title(raw_target)
-
-        if not norm:
-            continue
-
-        if is_ignored_namespace(norm):
-            continue
+    for link in links:
+        raw_target = extract_mediawiki_target(link["href"])
+        resolved, method = resolve_link(raw_target, link["title"])
 
         entry = {
             "source": file_path.name,
-            "href": href,
-            "normalized": norm,
+            "href": link["href"],
+            "title": link["title"],
+            "method": method,
         }
 
-        resolved = equivalences.get(norm)
-
         if resolved:
-            entry["resolved_title"] = resolved
+            entry["resolved"] = resolved
             resolved_links.append(entry)
         else:
+            entry["raw_target"] = raw_target
             unresolved_links.append(entry)
 
-    if i % 100 == 0:
+    if i % 200 == 0:
         print(f"{i}/{len(files)} analysées")
 
 # --------------------------------------------------
```
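For illustration, the shape of one record appended to `resolved_links` (field values invented here); unresolved links instead carry a `raw_target` field and no `resolved` key:

```python
import json

# Hypothetical resolved entry, as assembled in the scan loop above.
entry = {
    "source": "units_overview.html",  # page being scanned
    "href": "/wiki/Cygnar_Army",      # raw href found in #mw-content-text
    "title": "Cygnar Army",           # the <a> title attribute
    "method": "equivalence",          # how resolve_link matched it
    "resolved": "cygnar.html",        # canonical target filename
}
print(json.dumps(entry, ensure_ascii=False, indent=2))
```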