first experiments

maximator 2026-04-03 07:56:40 +02:00
commit 36c8bb2354
5 changed files with 542 additions and 0 deletions

63
analyze_categories.py Normal file

@@ -0,0 +1,63 @@
import json
from collections import defaultdict

INPUT_FILE = "categories.json"
OUTPUT_FILE = "categories_analysis.json"

# typical MediaWiki noise patterns
IGNORE_PATTERNS = [
    "pages using",
    "articles needing",
    "redirect",
    "template",
    "tracking",
    "with broken",
    "cleanup",
    "maintenance",
]


def classify(name, count):
    lname = name.lower()
    # technical
    if any(p in lname for p in IGNORE_PATTERNS):
        return "technical"
    # singleton = probably a mirror page
    if count == 1:
        return "singleton"
    if count <= 3:
        return "rare"
    if count < 20:
        return "medium"
    return "high"


def main():
    with open(INPUT_FILE, encoding="utf-8") as f:
        data = json.load(f)

    groups = defaultdict(list)
    for cat in data["categories"]:
        name = cat["name"]
        count = cat["count"]
        group = classify(name, count)
        groups[group].append({"name": name, "count": count})

    output = {"summary": {k: len(v) for k, v in groups.items()}, "groups": groups}
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("Analysis complete →", OUTPUT_FILE)


if __name__ == "__main__":
    main()
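
A quick sanity check of classify() against the thresholds above; pasting these lines at the bottom of the script should pass (the category names are invented):

assert classify("Pages using duplicate arguments in template calls", 40) == "technical"
assert classify("Cygnar", 1) == "singleton"
assert classify("Warcaster", 3) == "rare"
assert classify("Spell", 19) == "medium"
assert classify("Model", 250) == "high"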


@@ -0,0 +1,88 @@
import json
import re
from pathlib import Path
from collections import defaultdict, Counter

INPUT_DIR = Path(".")  # directory containing the files
OUTPUT_FILE = "category_analysis.json"

# ---------------------------
# wgCategories extraction
# ---------------------------
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    match = WG_RE.search(text)
    if not match:
        return []
    try:
        raw = json.loads(match.group(1))
        return [c.strip() for c in raw if isinstance(c, str)]
    except Exception:
        return []


# ---------------------------
# Global analysis
# ---------------------------
category_pages = Counter()
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))
print(f"{len(files)} files found")

for file in files:
    if not file.is_file():
        continue
    try:
        text = file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        continue

    categories = extract_categories(text)
    # IMPORTANT: avoid counting duplicates within a single page
    categories = list(set(categories))
    if not categories:
        continue

    # count occurrences
    for cat in categories:
        category_pages[cat] += 1

    # co-occurrences
    for cat in categories:
        others = set(categories)
        others.remove(cat)
        category_neighbors[cat].update(others)

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = []
for cat in category_pages:
    result.append(
        {
            "name": cat,
            "page_count": category_pages[cat],
            "neighbor_count": len(category_neighbors[cat]),
        }
    )

# a useful sort order for analysis
result.sort(key=lambda x: (-x["page_count"], x["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} categories written to {OUTPUT_FILE}")

134
extract_categories.py Normal file

@@ -0,0 +1,134 @@
import json
import re
from pathlib import Path
from collections import defaultdict

# =========================
# CONFIG
# =========================
INPUT_DIR = "."  # directory containing the 700 files
OUTPUT_FILE = "categories.json"

# extensions to scan
VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}

# =========================
# REGEX PATTERNS
# =========================
# [[Category:Something]]
CATEGORY_WIKI_RE = re.compile(r"\[\[\s*Category\s*:\s*([^\|\]]+)", re.IGNORECASE)

# wgCategories:["A","B"]
WG_CATEGORIES_RE = re.compile(r'wgCategories"\s*:\s*\[(.*?)\]', re.DOTALL)

# "Category name" inside wgCategories
WG_CATEGORY_ITEM_RE = re.compile(r'"([^"]+)"')

# HTML catlinks fallback
HTML_CATEGORY_RE = re.compile(r"<li>\s*<a[^>]*>(.*?)</a>", re.IGNORECASE)


# =========================
# EXTRACTION
# =========================
def extract_categories_from_text(text: str):
    found = set()

    # --- MediaWiki [[Category:...]]
    for match in CATEGORY_WIKI_RE.findall(text):
        found.add(match.strip())

    # --- wgCategories JS block
    wg_match = WG_CATEGORIES_RE.search(text)
    if wg_match:
        block = wg_match.group(1)
        for cat in WG_CATEGORY_ITEM_RE.findall(block):
            found.add(cat.strip())

    # --- HTML fallback
    for match in HTML_CATEGORY_RE.findall(text):
        if match.lower() != "categories":
            found.add(match.strip())

    return found


# =========================
# MAIN SCAN
# =========================
def scan_directory(path: Path):
    categories_count = defaultdict(int)
    scanned_files = 0

    for file in path.rglob("*"):
        if file.suffix.lower() not in VALID_EXTENSIONS:
            continue
        try:
            text = file.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"[SKIP] {file} ({e})")
            continue

        cats = extract_categories_from_text(text)
        for cat in cats:
            categories_count[cat] += 1

        scanned_files += 1
        if scanned_files % 50 == 0:
            print(f"{scanned_files} files scanned...")

    return categories_count


# =========================
# EXPORT
# =========================
def export_json(categories_count):
    data = {
        "total_unique_categories": len(categories_count),
        "categories": sorted(
            [
                {"name": name, "count": count}
                for name, count in categories_count.items()
            ],
            key=lambda x: (-x["count"], x["name"].lower()),
        ),
    }
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Export complete → {OUTPUT_FILE}")
    print(f"{len(categories_count)} unique categories found.")


# =========================
# ENTRYPOINT
# =========================
def main():
    path = Path(INPUT_DIR)
    if not path.exists():
        print("Directory not found:", INPUT_DIR)
        return
    print("Scanning files...\n")
    categories_count = scan_directory(path)
    export_json(categories_count)


if __name__ == "__main__":
    main()
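
A quick check of the three extraction paths: appending this to the script should print all three categories (the markup fragment is synthetic):

sample = """
[[Category:Cygnar]]
"wgCategories":["Warcaster"]
<div id="catlinks"><ul><li><a href="#">Model</a></li></ul></div>
"""
print(sorted(extract_categories_from_text(sample)))  # ['Cygnar', 'Model', 'Warcaster']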

101
remove_duplicate_pages.py Normal file

@@ -0,0 +1,101 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')

INVALID_CHARS = r'[<>:"/\\|?*]'
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}


def sanitize_filename(name: str) -> str:
    # forbidden chars
    name = re.sub(INVALID_CHARS, "_", name)
    # spaces → underscores
    name = name.replace(" ", "_")
    # remove trailing dots/spaces
    name = name.rstrip(". ")
    # Windows reserved names
    if name.upper() in RESERVED_NAMES:
        name = "_" + name
    return name


articles = {}

print("start parsing files")
for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki unicode escapes
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Sanitize filename
    clean_name = page_name.replace("Category:", "")
    clean_name = sanitize_filename(clean_name)
    filename = clean_name + ".html"

    # Selection logic: keep one copy per article id,
    # preferring a non-redirect page over a redirect
    if article_id not in articles:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
    else:
        if articles[article_id]["redirect"] and not is_redirect:
            articles[article_id] = {
                "path": file_path,
                "redirect": is_redirect,
                "filename": filename,
            }

# Copy the selected pages
print("start copying files")
for art in articles.values():
    dst = OUTPUT_DIR / art["filename"]
    try:
        shutil.copy2(art["path"], dst)
    except OSError as e:
        print("❌ Copy failed:", art["filename"], e)

print(f"✅ Unique pages kept: {len(articles)}")

156
sort_pages.py Normal file

@@ -0,0 +1,156 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path("unique_pages")
OUTPUT_DIR = Path("classified_pages")

CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------
FACTIONS = {
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

# factions that belong to Hordes; everything else is Warmachine
HORDES_FACTIONS = {
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

# checked in order, so the more specific keys come first
# ("warcaster unit" before "warcaster", etc.)
MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}


# ---------- helpers ----------
def extract_categories(html: str):
    m = CATEGORY_RE.search(html)
    if not m:
        return []
    raw = "[" + m.group(1) + "]"
    try:
        return [c.lower() for c in json.loads(raw)]
    except Exception:
        return []


def detect_faction(categories):
    for c in categories:
        if c in FACTIONS:
            return c
    return None


def detect_model_type(categories):
    for c, folder in MODEL_TYPES.items():
        if c in categories:
            return folder
    return None


def detect_system(faction):
    return "hordes" if faction in HORDES_FACTIONS else "warmachine"


# ---------- classification ----------
def classify(categories):
    # RULES PAGES
    if "spell" in categories:
        return Path("common/warmahordes rules/spells")
    if "animus" in categories:
        return Path("common/warmahordes rules/animi")
    if "model ability" in categories:
        return Path("common/warmahordes rules/model advantages")
    if "weapon ability" in categories:
        return Path("common/warmahordes rules/weapon qualities")

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        if faction and model_type:
            return Path(f"{detect_system(faction)}/{faction}/{model_type}")

    # THEME FORCES
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            return Path(f"{detect_system(faction)}/{faction}/theme forces")

    # fallback
    return Path("common/others")


# ---------- main ----------
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to classify")

for file_path in files:
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    categories = extract_categories(html)

    dest = OUTPUT_DIR / classify(categories)
    dest.mkdir(parents=True, exist_ok=True)
    shutil.copy2(file_path, dest / file_path.name)

print("✅ Classification complete")