102 lines
2.3 KiB
Python
102 lines
2.3 KiB
Python
|
|
from pathlib import Path
|
||
|
|
import shutil
|
||
|
|
import re
|
||
|
|
import json
|
||
|
|
|
||
|
|
|
||
|
|
INPUT_DIR = Path(".")
|
||
|
|
OUTPUT_DIR = Path("unique_pages")
|
||
|
|
OUTPUT_DIR.mkdir(exist_ok=True)
|
||
|
|
|
||
|
|
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
|
||
|
|
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
|
||
|
|
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
|
||
|
|
INVALID_CHARS = r'[<>:"/\\|?*]'
|
||
|
|
RESERVED_NAMES = {
|
||
|
|
"CON",
|
||
|
|
"PRN",
|
||
|
|
"AUX",
|
||
|
|
"NUL",
|
||
|
|
*(f"COM{i}" for i in range(1, 10)),
|
||
|
|
*(f"LPT{i}" for i in range(1, 10)),
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
def sanitize_filename(name: str) -> str:
|
||
|
|
# forbidden chars
|
||
|
|
name = re.sub(INVALID_CHARS, "_", name)
|
||
|
|
|
||
|
|
# spaces → underscore
|
||
|
|
name = name.replace(" ", "_")
|
||
|
|
|
||
|
|
# remove trailing dots/spaces
|
||
|
|
name = name.rstrip(". ")
|
||
|
|
|
||
|
|
# Windows reserved names
|
||
|
|
if name.upper() in RESERVED_NAMES:
|
||
|
|
name = "_" + name
|
||
|
|
|
||
|
|
return name
|
||
|
|
|
||
|
|
|
||
|
|
# Map wgArticleId -> best candidate file seen so far for that article.
articles = {}
print("start parsing files")

for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID — 0 (or missing) means "not a real article", skip it.
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag (absent flag is treated as "not a redirect").
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name.
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki \uXXXX escapes by parsing the value as a JSON
    # string literal.
    page_name = json.loads(f'"{m.group(1)}"')

    # Strip the namespace prefix only at the start of the name — a plain
    # replace() would also delete "Category:" occurring mid-name.
    if page_name.startswith("Category:"):
        page_name = page_name[len("Category:"):]
    filename = sanitize_filename(page_name) + ".html"

    entry = {
        "path": file_path,
        "redirect": is_redirect,
        "filename": filename,
    }

    previous = articles.get(article_id)
    if previous is None:
        articles[article_id] = entry
    elif previous["redirect"] and not is_redirect:
        # Prefer the real article page over a redirect to it.
        articles[article_id] = entry

# Copy phase — disambiguate filename collisions between DIFFERENT
# articles (two page names can sanitize to the same string), otherwise
# a later copy would silently overwrite an earlier one.
print("start copying files")
used_names = set()
for article_id, art in articles.items():
    filename = art["filename"]
    if filename in used_names:
        filename = f"{Path(filename).stem}_{article_id}.html"
    used_names.add(filename)

    dst = OUTPUT_DIR / filename
    try:
        shutil.copy2(art["path"], dst)
    except OSError as e:
        print("❌ Copy failed:", art["filename"], e)

print(f"✅ Unique pages kept: {len(articles)}")