first experiments
This commit is contained in:
commit
36c8bb2354
5 changed files with 542 additions and 0 deletions
101
remove_duplicate_pages.py
Normal file
101
remove_duplicate_pages.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
from pathlib import Path
|
||||
import shutil
|
||||
import re
|
||||
import json
|
||||
|
||||
|
||||
# Directory scanned for saved wiki pages (current working directory).
INPUT_DIR = Path(".")
# Deduplicated pages are copied here; created up front if missing.
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

# Patterns matching MediaWiki's embedded JS config (mw.config) in saved HTML:
# numeric article id, redirect flag, and the canonical page name.
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
# Characters Windows forbids in filenames (regex character class).
INVALID_CHARS = r'[<>:"/\\|?*]'
# Windows reserved device names; a file may not use these as its base name.
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Return *name* rewritten so it is a safe cross-platform filename.

    Replaces Windows-forbidden characters and spaces with underscores,
    strips trailing dots/spaces (which Windows silently drops), and
    prefixes reserved device names (CON, COM1, ...) with an underscore.

    Relies on the module-level INVALID_CHARS pattern and RESERVED_NAMES set.
    """
    # Forbidden chars -> underscore.
    name = re.sub(INVALID_CHARS, "_", name)

    # Spaces -> underscore for shell-friendly names.
    name = name.replace(" ", "_")

    # Remove trailing dots/spaces (illegal at the end of a Windows filename).
    name = name.rstrip(". ")

    # Reserved device names are case-insensitive on Windows.
    if name.upper() in RESERVED_NAMES:
        name = "_" + name

    # Fix: a name made entirely of dots/spaces sanitizes to "", which
    # would later produce a hidden file literally named ".html".
    if not name:
        name = "_"

    return name
|
||||
|
||||
|
||||
# Map article_id -> {"path": Path, "redirect": bool, "filename": str},
# keeping exactly one source file per article, preferring non-redirects.
articles = {}
print("start parsing files")

for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID; id 0 (or none) means the page is not a real article.
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag (absent flag is treated as "not a redirect").
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name; skip files where it cannot be found.
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # The captured value is a JSON string body, so wrapping it in quotes
    # and json.loads-ing decodes MediaWiki's \uXXXX unicode escapes.
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Strip only a *leading* "Category:" namespace prefix. (A plain
    # replace("Category:", "") would also delete the substring when it
    # appears in the middle of a title.)
    clean_name = page_name
    if clean_name.startswith("Category:"):
        clean_name = clean_name[len("Category:"):]
    clean_name = sanitize_filename(clean_name)
    filename = clean_name + ".html"

    # Selection logic: first file wins, unless the stored copy is a
    # redirect and the new one is not.
    entry = {
        "path": file_path,
        "redirect": is_redirect,
        "filename": filename,
    }
    if article_id not in articles:
        articles[article_id] = entry
    elif articles[article_id]["redirect"] and not is_redirect:
        articles[article_id] = entry
|
||||
|
||||
# Copy
|
||||
# Copy each surviving page into OUTPUT_DIR under its sanitized name.
print("start copying files")
for entry in articles.values():
    destination = OUTPUT_DIR / entry["filename"]
    try:
        # copy2 preserves metadata (timestamps) along with contents.
        shutil.copy2(entry["path"], destination)
    except OSError as err:
        # Best-effort: report the failure and keep copying the rest.
        print("❌ Copy failed:", entry["filename"], err)

print(f"✅ Unique pages kept: {len(articles)}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue