whu_migration_scripts/remove_duplicate_pages.py

102 lines
2.3 KiB
Python
Raw Normal View History

2026-04-03 07:56:40 +02:00
from pathlib import Path
import shutil
import re
import json
# Directory scanned for *.html page dumps (current working directory).
INPUT_DIR = Path(".")
# Deduplicated pages are copied here; created on startup if missing.
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)
# MediaWiki embeds page metadata as JS config variables inside each HTML dump;
# these patterns pull them back out of the raw markup.
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
# Characters Windows forbids in filenames ("/" also covers POSIX).
INVALID_CHARS = r'[<>:"/\\|?*]'
# Windows reserved device names; the filesystem check is case-insensitive,
# so callers compare against this set after upper-casing.
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}
def sanitize_filename(name: str) -> str:
    """Make *name* safe to use as a filename on Windows and POSIX.

    Replaces forbidden characters and spaces with underscores, strips
    trailing dots/spaces (invalid on Windows), prefixes Windows reserved
    device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9) with an underscore,
    and falls back to "_" when sanitisation leaves nothing at all.

    Args:
        name: Raw page name (may contain any characters).

    Returns:
        A non-empty, filesystem-safe name (without extension).
    """
    # Characters Windows forbids in filenames; "/" also covers POSIX.
    name = re.sub(r'[<>:"/\\|?*]', "_", name)
    # Spaces -> underscore for friendlier paths.
    name = name.replace(" ", "_")
    # Windows rejects names ending in dots or spaces.
    name = name.rstrip(". ")
    # Windows reserved device names (check is case-insensitive).
    reserved = {
        "CON",
        "PRN",
        "AUX",
        "NUL",
        *(f"COM{i}" for i in range(1, 10)),
        *(f"LPT{i}" for i in range(1, 10)),
    }
    if name.upper() in reserved:
        name = "_" + name
    # Guard against an empty result (e.g. input was "..." or all spaces),
    # which would otherwise produce a hidden ".html" output file.
    return name or "_"
# Scan every HTML dump in the input directory and keep, per article ID,
# a single best copy (preferring real pages over redirects).
articles = {}
print("start parsing files")
for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID: skip files without one, and ID 0 (special/non-article pages).
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag; a missing flag counts as "not a redirect".
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name.
    m = PAGENAME_RE.search(html)
    if not m:
        continue
    raw_name = m.group(1)
    # wgPageName is a JS string literal: decode \uXXXX escapes via json.
    # Guard against malformed escapes (e.g. the [^"]+ capture truncating
    # at an escaped quote leaves a dangling backslash), which would
    # otherwise abort the whole run with a JSONDecodeError.
    try:
        page_name = json.loads(f'"{raw_name}"')
    except ValueError:
        page_name = raw_name

    # Build the output filename from the decoded page name.
    clean_name = page_name.replace("Category:", "")
    clean_name = sanitize_filename(clean_name)
    filename = clean_name + ".html"

    # Keep the first copy seen; replace it only when a stored redirect can
    # be upgraded to a real (non-redirect) page.
    if article_id not in articles or (
        articles[article_id]["redirect"] and not is_redirect
    ):
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
# Copy each selected unique page into the output directory.
print("start copying files")
for entry in articles.values():
    destination = OUTPUT_DIR / entry["filename"]
    try:
        shutil.copy2(entry["path"], destination)
    except OSError as err:
        # Best-effort: report the failure and keep copying the rest.
        print("❌ Copy failed:", entry["filename"], err)
print(f"✅ Unique pages kept: {len(articles)}")