first experiments
This commit is contained in:
commit
36c8bb2354
5 changed files with 542 additions and 0 deletions
156
sort_pages.py
Normal file
156
sort_pages.py
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
from pathlib import Path
|
||||
import shutil
|
||||
import re
|
||||
import json
|
||||
|
||||
# Directory scanned for the ``*.html`` pages to classify.
INPUT_DIR = Path("unique_pages")
# Root directory under which the classified copies are written.
OUTPUT_DIR = Path("classified_pages")

# Captures the raw contents of the "wgCategories" array embedded in a page.
CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------

# Lower-cased category names that are recognised as factions.
FACTIONS = {
    "cygnar", "khador", "protectorate of menoth", "cryx",
    "retribution of scyrah", "convergence of cyriss", "cephalyx",
    "mercenary", "crucible guard", "infernals",
    "trollblood", "circle orboros", "skorne", "legion of everblight",
    "grymkin", "minion",
}

# Maps a lower-cased category name to the destination folder name.
# Lookups walk this mapping in declaration order, so the more specific
# "... unit" keys are listed before "warcaster", "warlock" and "unit".
MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}
|
||||
|
||||
# ---------- helpers ----------
|
||||
|
||||
|
||||
def extract_categories(html: str):
    """Return the lower-cased category names embedded in a page's HTML.

    The text is searched with ``CATEGORY_RE`` for a ``"wgCategories":[...]``
    fragment; the captured contents are re-wrapped in brackets and parsed as
    a JSON array.

    Args:
        html: Full text of the page.

    Returns:
        A list of lower-cased strings, or an empty list when the fragment is
        missing, cannot be parsed as JSON, or holds a non-string entry.
    """
    match = CATEGORY_RE.search(html)
    if not match:
        return []

    try:
        names = json.loads("[" + match.group(1) + "]")
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input. Both mean "no usable categories".
        return []

    try:
        return [name.lower() for name in names]
    except AttributeError:
        # A non-string entry has no .lower(); keep the best-effort contract
        # of returning nothing rather than a partial list.
        return []
|
||||
|
||||
|
||||
def detect_faction(categories):
    """Return the first entry of *categories* that is a known faction.

    Entries are tested in order against the module-level ``FACTIONS`` set.
    ``None`` is returned when no entry matches.
    """
    return next((name for name in categories if name in FACTIONS), None)
|
||||
|
||||
|
||||
def detect_model_type(categories):
    """Return the folder name of the first ``MODEL_TYPES`` key in *categories*.

    Keys are tried in the mapping's declaration order, so when several keys
    are present the one declared first wins. ``None`` is returned when no
    key is present.
    """
    candidates = (
        folder_name
        for type_key, folder_name in MODEL_TYPES.items()
        if type_key in categories
    )
    return next(candidates, None)
|
||||
|
||||
|
||||
# ---------- classification ----------
|
||||
|
||||
|
||||
# Factions filed under the "hordes" tree; any other faction goes to "warmachine".
_HORDES_FACTIONS = frozenset(
    {
        "trollblood",
        "circle orboros",
        "skorne",
        "legion of everblight",
        "grymkin",
        "minion",
    }
)

# Rule categories that map straight to a shared folder, checked in this order.
_RULE_FOLDERS = (
    ("spell", "common/warmahordes rules/spells"),
    ("animus", "common/warmahordes rules/animi"),
    ("model ability", "common/warmahordes rules/model advantages"),
    ("weapon ability", "common/warmahordes rules/weapon qualities"),
)


def _system_for(faction):
    """Return the top-level folder ("hordes" or "warmachine") for *faction*."""
    return "hordes" if faction in _HORDES_FACTIONS else "warmachine"


def classify(categories):
    """Map a page's categories to the relative folder it should be filed in.

    Rules are applied in order and the first match wins:

    1. A rule category (spell, animus, model ability, weapon ability) maps
       to its folder under ``common/warmahordes rules``.
    2. A ``"model"`` page with both a detected faction and a detected model
       type maps to ``<system>/<faction>/<model type>``.
    3. A ``"theme force"`` page with a detected faction maps to
       ``<system>/<faction>/theme forces``.
    4. Anything else maps to ``common/others``.

    Args:
        categories: Lower-cased category names of the page.

    Returns:
        A relative ``Path``.
    """
    # SPELLS / RULES
    for category, folder in _RULE_FOLDERS:
        if category in categories:
            return Path(folder)

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        # A model page missing either piece falls through to the later rules.
        if faction and model_type:
            system = _system_for(faction)
            return Path(f"{system}/{faction}/{model_type}")

    # THEME FORCE
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            system = _system_for(faction)
            return Path(f"{system}/{faction}/theme forces")

    # fallback
    return Path("common/others")
|
||||
|
||||
|
||||
# ---------- main ----------
|
||||
|
||||
# Collect the pages up front so the total can be reported before copying.
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages à classifier")

for page in files:
    # Undecodable bytes are dropped; only the category fragment is needed.
    markup = page.read_text(encoding="utf-8", errors="ignore")
    target_dir = OUTPUT_DIR / classify(extract_categories(markup))
    target_dir.mkdir(parents=True, exist_ok=True)

    # copy2 leaves the source page in place and also copies file metadata.
    shutil.copy2(page, target_dir / page.name)

print("✅ Classification terminée")
|
||||
Loading…
Add table
Add a link
Reference in a new issue