2026-04-03 07:56:40 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
import shutil
|
|
|
|
|
import re
|
|
|
|
|
import json
|
|
|
|
|
|
2026-04-03 15:50:40 +02:00
|
|
|
# Directory scanned for "*.html" pages to classify (relative paths resolve
# against the current working directory).
INPUT_DIR = Path("..") / "unique_pages"
# Root directory under which the classified copies are written.
OUTPUT_DIR = Path("..") / "classified_pages"
|
2026-04-03 07:56:40 +02:00
|
|
|
|
|
|
|
|
# Captures the raw text between the brackets of a `"wgCategories":[...]` array
# embedded in a page (presumably the MediaWiki page-config variable - confirm).
# The capture is non-greedy, so it stops at the first `]` after the opening
# bracket; extract_categories() re-wraps it in brackets and parses it as JSON.
CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')
|
|
|
|
|
|
|
|
|
|
# ---------- CONFIG ----------
|
|
|
|
|
|
|
|
|
|
# Lower-cased category names that identify a faction. detect_faction() returns
# the first page category found in this set.
FACTIONS = {
    # Filed under "warmachine" by classify().
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    # Filed under "hordes" by classify().
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}
|
|
|
|
|
|
|
|
|
|
# Maps a lower-cased model-type category to the destination sub-folder name.
# Insertion order matters: detect_model_type() returns the folder of the first
# key present in a page's categories, so the more specific "... unit" keys are
# listed ahead of their shorter counterparts.
MODEL_TYPES = {
    # Leader units first, then the plain leader types.
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    # Constructs and creatures.
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    # Everything else.
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}
|
|
|
|
|
|
|
|
|
|
# ---------- helpers ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_categories(html: str) -> list:
    """Return the lower-cased category names embedded in a page's HTML.

    The page is searched with ``CATEGORY_RE``; the captured array contents
    are re-wrapped in brackets and parsed as JSON.

    Args:
        html: Full text of the page.

    Returns:
        A list of lower-cased category strings, in page order. An empty list
        is returned when the pattern is not found or the captured text is not
        valid JSON. Entries that are not strings are skipped.
    """
    m = CATEGORY_RE.search(html)
    if not m:
        return []

    raw = "[" + m.group(1) + "]"
    try:
        parsed = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. Only parse failures
        # are treated as "no categories"; KeyboardInterrupt and genuine
        # programming errors are no longer silently swallowed.
        return []

    # Guard against non-string JSON values instead of relying on a caught
    # AttributeError from .lower().
    return [c.lower() for c in parsed if isinstance(c, str)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_faction(categories):
    """Return the first entry of *categories* that is in ``FACTIONS``.

    Returns ``None`` when no category names a known faction.
    """
    return next((name for name in categories if name in FACTIONS), None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_model_type(categories):
    """Return the folder name for the first ``MODEL_TYPES`` key in *categories*.

    Keys are tried in the dictionary's insertion order, so earlier entries
    take precedence when a page carries several model-type categories.
    Returns ``None`` when none of the keys is present.
    """
    return next(
        (folder for label, folder in MODEL_TYPES.items() if label in categories),
        None,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------- classification ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Factions whose pages are filed under "hordes"; every other faction is filed
# under "warmachine".
_HORDES_FACTIONS = frozenset(
    {
        "trollblood",
        "circle orboros",
        "skorne",
        "legion of everblight",
        "grymkin",
        "minion",
    }
)

# Rule categories that map straight to a fixed folder, checked in this order.
_RULE_FOLDERS = (
    ("spell", "common/warmahordes rules/spells"),
    ("animus", "common/warmahordes rules/animi"),
    ("model ability", "common/warmahordes rules/model advantages"),
    ("weapon ability", "common/warmahordes rules/weapon qualities"),
)


def _system_for(faction):
    """Return the top-level folder ("hordes" or "warmachine") for *faction*."""
    return "hordes" if faction in _HORDES_FACTIONS else "warmachine"


def classify(categories):
    """Map a page's categories to its destination folder, relative to the output root.

    Rules are applied in order and the first match wins:

    1. Rule pages ("spell", "animus", "model ability", "weapon ability") go
       to a fixed folder under ``common/warmahordes rules``.
    2. A "model" page with both a known faction and a known model type goes
       to ``<system>/<faction>/<model type folder>``.
    3. A "theme force" page with a known faction goes to
       ``<system>/<faction>/theme forces``.
    4. Anything else goes to ``common/others``.

    Args:
        categories: Lower-cased category names of the page.

    Returns:
        A relative ``Path`` for the destination folder.
    """
    # Rule pages
    for category, folder in _RULE_FOLDERS:
        if category in categories:
            return Path(folder)

    # Models: a "model" page missing a faction or a model type falls through
    # to the checks below rather than being filed under a partial path.
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        if faction and model_type:
            return Path(f"{_system_for(faction)}/{faction}/{model_type}")

    # Theme forces
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            return Path(f"{_system_for(faction)}/{faction}/theme forces")

    # Fallback
    return Path("common/others")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------- main ----------
|
|
|
|
|
|
|
|
|
|
# Collect the matching pages up front so the total can be reported before
# any copying starts.
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages à classifier")

for page in files:
    # Undecodable bytes are dropped instead of aborting the run.
    markup = page.read_text(encoding="utf-8", errors="ignore")

    # The destination folder is derived from the page's categories.
    target_dir = OUTPUT_DIR / classify(extract_categories(markup))
    target_dir.mkdir(parents=True, exist_ok=True)

    # Copy, not move: the source page stays in INPUT_DIR.
    shutil.copy2(page, target_dir / page.name)

print("✅ Classification terminée")
|