from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

# These keys appear in the mw.config values that MediaWiki embeds as
# inline JSON in every rendered page, e.g. "wgArticleId":12345,"wgIsRedirect":false
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')

INVALID_CHARS = r'[<>:"/\\|?*]'
RESERVED_NAMES = {
    "CON", "PRN", "AUX", "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}


def sanitize_filename(name: str) -> str:
    # Replace characters Windows forbids in filenames
    name = re.sub(INVALID_CHARS, "_", name)
    # Spaces → underscores
    name = name.replace(" ", "_")
    # Windows also rejects trailing dots and spaces
    name = name.rstrip(". ")
    # Prefix reserved device names (CON, COM1, ...) so they stay usable
    if name.upper() in RESERVED_NAMES:
        name = "_" + name
    return name


articles = {}

print("start parsing files")
for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID (0 means a special/virtual page; skip those)
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki \uXXXX escapes by parsing the value as a JSON string
    # (page names containing a literal double quote are not handled)
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Drop the "Category:" namespace prefix, then sanitize for the filesystem
    clean_name = sanitize_filename(page_name.replace("Category:", ""))
    filename = clean_name + ".html"

    # Keep one file per article id, preferring a non-redirect page
    # over a previously seen redirect
    if article_id not in articles:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
    elif articles[article_id]["redirect"] and not is_redirect:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }

print("start copying files")
used_names = set()
for article_id, art in articles.items():
    filename = art["filename"]
    # Two different pages can sanitize to the same name; disambiguate
    # with the article id instead of silently overwriting
    if filename in used_names:
        filename = f"{Path(filename).stem}_{article_id}.html"
    used_names.add(filename)
    try:
        shutil.copy2(art["path"], OUTPUT_DIR / filename)
    except OSError as e:
        print("❌ Copy failed:", filename, e)

print(f"✅ Unique pages kept: {len(articles)}")
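
# A minimal sanity check for sanitize_filename; the expected values are
# worked out by hand from the rules above and are illustrative, not
# authoritative test cases.
assert sanitize_filename("Foo: Bar/Baz?") == "Foo__Bar_Baz_"
assert sanitize_filename("con") == "_con"
assert sanitize_filename("Trailing dots..") == "Trailing_dots"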