"""Sort saved HTML pages into a folder tree based on their page categories.

Each ``*.html`` file in ``INPUT_DIR`` is scanned for an embedded
``"wgCategories":[...]`` JSON array.  The (lower-cased) categories decide the
sub-folder of ``OUTPUT_DIR`` the file is copied into; pages that match no rule
go to ``common/others``.
"""

from pathlib import Path
from typing import Collection, List, Optional
import shutil
import re
import json

INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../classified_pages")

# Locates the category array inside the page source.  Only the position of the
# opening "[" is relied upon; the array itself is decoded with a JSON decoder.
CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------
FACTIONS = {
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

# Factions filed under the "hordes" top-level folder; every other faction is
# filed under "warmachine".
HORDES_FACTIONS = frozenset(
    {
        "trollblood",
        "circle orboros",
        "skorne",
        "legion of everblight",
        "grymkin",
        "minion",
    }
)

# Category name -> destination folder name.
# Order matters: the first key found among a page's categories wins, so the
# more specific "warcaster unit" / "warlock unit" must precede "warcaster",
# "warlock" and "unit".
MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}

_JSON_DECODER = json.JSONDecoder()


# ---------- helpers ----------
def extract_categories(html: str) -> List[str]:
    """Return the lower-cased categories embedded in *html*.

    Args:
        html: Full text of a saved page.

    Returns:
        The entries of the ``"wgCategories"`` array, lower-cased, in page
        order.  An empty list is returned when the array is absent, is not
        valid JSON, or contains a non-string entry.
    """
    match = CATEGORY_RE.search(html)
    if not match:
        return []
    # Group 1 starts right after the opening "[", so step back one character.
    array_start = match.start(1) - 1
    try:
        # raw_decode stops at the real end of the array, so a "]" inside a
        # category name no longer truncates the list (the regex alone would
        # cut at the first "]").
        values, _end = _JSON_DECODER.raw_decode(html[array_start:])
        return [value.lower() for value in values]
    except (ValueError, AttributeError):
        # ValueError: malformed JSON; AttributeError: non-string entry.
        return []


def detect_faction(categories: Collection[str]) -> Optional[str]:
    """Return the first category that is a known faction, or ``None``."""
    for category in categories:
        if category in FACTIONS:
            return category
    return None


def detect_model_type(categories: Collection[str]) -> Optional[str]:
    """Return the folder name of the first ``MODEL_TYPES`` key in *categories*.

    Keys are tested in ``MODEL_TYPES`` declaration order, not in page order.
    Returns ``None`` when no key matches.
    """
    for category, folder in MODEL_TYPES.items():
        if category in categories:
            return folder
    return None


def _system_for(faction: str) -> str:
    """Return the top-level folder ("hordes" or "warmachine") for *faction*."""
    return "hordes" if faction in HORDES_FACTIONS else "warmachine"


# ---------- classification ----------
def classify(categories: Collection[str]) -> Path:
    """Map a page's categories to its destination folder (relative path).

    Rules are evaluated in order: rule pages (spell, animus, model ability,
    weapon ability), then models, then theme forces.  A "model" page lacking a
    recognised faction or model type falls through to the later rules.

    Args:
        categories: Lower-cased category names of the page.

    Returns:
        A relative ``Path``; ``common/others`` when no rule applies.
    """
    # SPELLS
    if "spell" in categories:
        return Path("common/warmahordes rules/spells")
    if "animus" in categories:
        return Path("common/warmahordes rules/animi")
    if "model ability" in categories:
        return Path("common/warmahordes rules/model advantages")
    if "weapon ability" in categories:
        return Path("common/warmahordes rules/weapon qualities")

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        if faction and model_type:
            return Path(f"{_system_for(faction)}/{faction}/{model_type}")

    # THEME FORCE
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            return Path(f"{_system_for(faction)}/{faction}/theme forces")

    # fallback
    return Path("common/others")


# ---------- main ----------
def main() -> None:
    """Copy every ``*.html`` file of ``INPUT_DIR`` into its classified folder."""
    files = sorted(INPUT_DIR.glob("*.html"))
    print(f"{len(files)} pages à classifier")

    for file_path in files:
        # Undecodable bytes are dropped; only the category array is needed.
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        categories = extract_categories(html)

        dest = OUTPUT_DIR / classify(categories)
        dest.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, dest / file_path.name)

    print("✅ Classification terminée")


if __name__ == "__main__":
    main()