# whu_migration_scripts/sort_pages.py
# (repository viewer header: 156 lines, 3.5 KiB, Python)

from pathlib import Path
import shutil
import re
import json
# Directory scanned for the saved pages (*.html) to classify. The path is
# relative, so it is resolved against the current working directory.
INPUT_DIR = Path("../unique_pages")
# Root directory under which the classified copies are written.
OUTPUT_DIR = Path("../classified_pages")
# Captures the contents of the `"wgCategories":[...]` array embedded in a page.
# The group is non-greedy, so the capture stops at the first `]`.
CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')
# ---------- CONFIG ----------
# Lower-case category names that identify a faction (see detect_faction()).
FACTIONS = {
    # Filed under "warmachine" by classify().
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    # Filed under "hordes" by classify().
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}
# Maps a model-type category (lower-case) to the destination folder name.
# Insertion order matters: detect_model_type() returns the folder of the first
# key found in a page's categories, so the more specific "... unit" entries are
# listed ahead of the generic ones.
# NOTE(review): some folder names are singular ("warcaster", "structure") while
# most are plural; confirm this is intended before normalising.
MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}
# ---------- helpers ----------
def extract_categories(html: str):
    """Return the lower-cased categories embedded in a page's HTML.

    The page is searched for the ``"wgCategories":[...]`` fragment matched by
    ``CATEGORY_RE``; the bracketed contents are decoded as a JSON array.

    Args:
        html: Text of the page to inspect.

    Returns:
        A list of lower-cased category names. The list is empty when the
        fragment is absent or its contents are not valid JSON. Entries that
        are not strings are skipped.
    """
    match = CATEGORY_RE.search(html)
    if match is None:
        return []

    try:
        decoded = json.loads("[" + match.group(1) + "]")
    except (json.JSONDecodeError, RecursionError):
        # Best effort: a malformed (or absurdly nested) fragment means "no
        # categories". Only decode failures are caught, so KeyboardInterrupt
        # and genuine programming errors are no longer swallowed.
        return []

    # Filter instead of relying on an AttributeError from .lower(), so a stray
    # non-string entry does not throw away the valid names next to it.
    return [name.lower() for name in decoded if isinstance(name, str)]
def detect_faction(categories):
    """Return the first entry of ``categories`` found in ``FACTIONS``.

    Args:
        categories: Iterable of category names, scanned in order.

    Returns:
        The first matching faction name, or ``None`` when nothing matches.
    """
    known = (name for name in categories if name in FACTIONS)
    return next(known, None)
def detect_model_type(categories):
    """Return the folder for the first ``MODEL_TYPES`` key in ``categories``.

    Keys are tried in ``MODEL_TYPES`` insertion order, so when several keys are
    present the earliest-declared one decides the result.

    Args:
        categories: Collection of category names to test membership against.

    Returns:
        The folder name mapped to the first matching key, or ``None`` when no
        key is present.
    """
    folders = (
        folder for key, folder in MODEL_TYPES.items() if key in categories
    )
    return next(folders, None)
# ---------- classification ----------
# Factions filed under "hordes"; every other faction goes under "warmachine".
_HORDES_FACTIONS = frozenset(
    {
        "trollblood",
        "circle orboros",
        "skorne",
        "legion of everblight",
        "grymkin",
        "minion",
    }
)

# (category, destination folder) pairs for shared rules pages. They are tried
# in this order, so an earlier entry wins when a page carries several of them.
_RULES_FOLDERS = (
    ("spell", "common/warmahordes rules/spells"),
    ("animus", "common/warmahordes rules/animi"),
    ("model ability", "common/warmahordes rules/model advantages"),
    ("weapon ability", "common/warmahordes rules/weapon qualities"),
)


def _game_system(faction):
    """Return the top-level folder ("hordes" or "warmachine") for a faction."""
    return "hordes" if faction in _HORDES_FACTIONS else "warmachine"


def classify(categories):
    """Choose the destination folder (relative path) for a page.

    Rules are applied in this order, and the first one that yields a path wins:

    1. Shared rules pages (spell, animus, model ability, weapon ability).
    2. Model pages, when both a faction and a model type are detected:
       ``<system>/<faction>/<model type folder>``.
    3. Theme force pages, when a faction is detected:
       ``<system>/<faction>/theme forces``.
    4. Everything else: ``common/others``.

    A model page lacking a faction or model type is not rejected outright; it
    falls through to the theme force rule and then to the fallback.

    Args:
        categories: Lower-cased category names of the page.

    Returns:
        A relative ``Path`` naming the folder the page belongs in.
    """
    # RULES
    for category, folder in _RULES_FOLDERS:
        if category in categories:
            return Path(folder)

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        if faction and model_type:
            return Path(f"{_game_system(faction)}/{faction}/{model_type}")

    # THEME FORCE
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            return Path(f"{_game_system(faction)}/{faction}/theme forces")

    # fallback
    return Path("common/others")
# ---------- main ----------
# Materialise the matches up front so the total can be reported before any
# copying starts.
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages à classifier")

for page in files:
    # Undecodable bytes are dropped instead of aborting the whole run.
    markup = page.read_text(encoding="utf-8", errors="ignore")
    target_dir = OUTPUT_DIR / classify(extract_categories(markup))
    target_dir.mkdir(parents=True, exist_ok=True)
    # The original is left in place; the copy keeps the same file name.
    shutil.copy2(page, target_dir / page.name)

print("✅ Classification terminée")