first experiments
commit 36c8bb2354
5 changed files with 542 additions and 0 deletions
63 analyze_categories.py Normal file
@@ -0,0 +1,63 @@
import json
from collections import defaultdict

INPUT_FILE = "categories.json"
OUTPUT_FILE = "categories_analysis.json"

# typical MediaWiki noise patterns
IGNORE_PATTERNS = [
    "pages using",
    "articles needing",
    "redirect",
    "template",
    "tracking",
    "with broken",
    "cleanup",
    "maintenance",
]


def classify(name, count):
    lname = name.lower()

    # technical / maintenance noise
    if any(p in lname for p in IGNORE_PATTERNS):
        return "technical"

    # singleton = probably a mirror page
    if count == 1:
        return "singleton"

    if count <= 3:
        return "rare"

    if count < 20:
        return "medium"

    return "high"


def main():
    with open(INPUT_FILE, encoding="utf-8") as f:
        data = json.load(f)

    groups = defaultdict(list)

    for cat in data["categories"]:
        name = cat["name"]
        count = cat["count"]

        group = classify(name, count)

        groups[group].append({"name": name, "count": count})

    output = {"summary": {k: len(v) for k, v in groups.items()}, "groups": groups}

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("Analysis finished →", OUTPUT_FILE)


if __name__ == "__main__":
    main()
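For reference, the categories.json read above is the export produced by extract_categories.py further down in this commit; a minimal, hand-written sketch of that shape (names and counts are made up), annotated with the group each entry would fall into:

import json

# Hand-written sample of the categories.json shape; names and counts are invented.
sample = {
    "total_unique_categories": 3,
    "categories": [
        {"name": "Cygnar", "count": 42},                          # classify() -> "high"
        {"name": "Pages using duplicate arguments", "count": 7},  # classify() -> "technical"
        {"name": "Lonely mirror page", "count": 1},               # classify() -> "singleton"
    ],
}

with open("categories.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=2, ensure_ascii=False)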
88 category_graph_analysis.py Normal file
@@ -0,0 +1,88 @@
import json
import re
from pathlib import Path
from collections import defaultdict, Counter

INPUT_DIR = Path(".")  # directory containing the files
OUTPUT_FILE = "category_analysis.json"


# ---------------------------
# wgCategories extraction
# ---------------------------
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    match = WG_RE.search(text)
    if not match:
        return []

    try:
        raw = json.loads(match.group(1))
        return [c.strip() for c in raw if isinstance(c, str)]
    except Exception:
        return []


# ---------------------------
# Global analysis
# ---------------------------
category_pages = Counter()
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))

print(f"{len(files)} files found")

for file in files:
    if not file.is_file():
        continue

    try:
        text = file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        continue

    categories = extract_categories(text)

    # IMPORTANT: avoid duplicates within a single page
    categories = list(set(categories))

    if not categories:
        continue

    # count occurrences
    for cat in categories:
        category_pages[cat] += 1

    # co-occurrences
    for cat in categories:
        others = set(categories)
        others.remove(cat)
        category_neighbors[cat].update(others)

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = []

for cat in category_pages:
    result.append(
        {
            "name": cat,
            "page_count": category_pages[cat],
            "neighbor_count": len(category_neighbors[cat]),
        }
    )

# sort order useful for analysis
result.sort(key=lambda x: (-x["page_count"], x["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} categories written to {OUTPUT_FILE}")
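WG_RE targets the category list that MediaWiki embeds in each page's inline JS config; a rough, hand-written fragment of the kind of text it is expected to match (not taken from a real dump, the page and category names are placeholders):

import json
import re

WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)

# Hand-written approximation of a MediaWiki config blob, not a real page.
snippet = '... "wgPageName":"Stryker1","wgCategories":["Cygnar","Warcaster","Model"], ...'

m = WG_RE.search(snippet)
print(json.loads(m.group(1)))  # ['Cygnar', 'Warcaster', 'Model']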
134 extract_categories.py Normal file
@@ -0,0 +1,134 @@
import json
import re
from pathlib import Path
from collections import defaultdict

# =========================
# CONFIG
# =========================

INPUT_DIR = "."  # directory containing the 700 files
OUTPUT_FILE = "categories.json"

# extensions to analyze
VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}

# =========================
# REGEX PATTERNS
# =========================

# [[Category:Something]]
CATEGORY_WIKI_RE = re.compile(r"\[\[\s*Category\s*:\s*([^\|\]]+)", re.IGNORECASE)

# wgCategories:["A","B"]
WG_CATEGORIES_RE = re.compile(r'wgCategories"\s*:\s*\[(.*?)\]', re.DOTALL)

# "Category name" inside wgCategories
WG_CATEGORY_ITEM_RE = re.compile(r'"([^"]+)"')

# HTML catlinks fallback
HTML_CATEGORY_RE = re.compile(r"<li>\s*<a[^>]*>(.*?)</a>", re.IGNORECASE)

# =========================
# EXTRACTION
# =========================


def extract_categories_from_text(text: str):
    found = set()

    # --- MediaWiki [[Category:...]]
    for match in CATEGORY_WIKI_RE.findall(text):
        found.add(match.strip())

    # --- wgCategories JS block
    wg_match = WG_CATEGORIES_RE.search(text)
    if wg_match:
        block = wg_match.group(1)
        for cat in WG_CATEGORY_ITEM_RE.findall(block):
            found.add(cat.strip())

    # --- HTML fallback
    for match in HTML_CATEGORY_RE.findall(text):
        if match.lower() != "categories":
            found.add(match.strip())

    return found


# =========================
# MAIN SCAN
# =========================


def scan_directory(path: Path):
    categories_count = defaultdict(int)
    scanned_files = 0

    for file in path.rglob("*"):
        if file.suffix.lower() not in VALID_EXTENSIONS:
            continue

        try:
            text = file.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"[SKIP] {file} ({e})")
            continue

        cats = extract_categories_from_text(text)

        for cat in cats:
            categories_count[cat] += 1

        scanned_files += 1

        if scanned_files % 50 == 0:
            print(f"{scanned_files} files analyzed...")

    return categories_count


# =========================
# EXPORT
# =========================


def export_json(categories_count):
    data = {
        "total_unique_categories": len(categories_count),
        "categories": sorted(
            [
                {"name": name, "count": count}
                for name, count in categories_count.items()
            ],
            key=lambda x: (-x["count"], x["name"].lower()),
        ),
    }

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Export finished → {OUTPUT_FILE}")
    print(f"{len(categories_count)} unique categories found.")


# =========================
# ENTRYPOINT
# =========================


def main():
    path = Path(INPUT_DIR)

    if not path.exists():
        print("Directory not found:", INPUT_DIR)
        return

    print("Analyzing files...\n")

    categories_count = scan_directory(path)
    export_json(categories_count)


if __name__ == "__main__":
    main()
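A small follow-up check of the export, assuming the script has already been run inside the scraped-pages directory:

import json

# Quick inspection of categories.json as written by export_json() above.
with open("categories.json", encoding="utf-8") as f:
    data = json.load(f)

print(data["total_unique_categories"])
print(data["categories"][:5])  # the five most frequent categories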
101 remove_duplicate_pages.py Normal file
@@ -0,0 +1,101 @@
from pathlib import Path
import shutil
import re
import json


INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
INVALID_CHARS = r'[<>:"/\\|?*]'
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}


def sanitize_filename(name: str) -> str:
    # forbidden chars
    name = re.sub(INVALID_CHARS, "_", name)

    # spaces → underscores
    name = name.replace(" ", "_")

    # remove trailing dots/spaces
    name = name.rstrip(". ")

    # Windows reserved names
    if name.upper() in RESERVED_NAMES:
        name = "_" + name

    return name


articles = {}
print("start parsing files")

for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue

    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki unicode escapes
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Sanitize filename
    clean_name = page_name.replace("Category:", "")
    clean_name = sanitize_filename(clean_name)

    filename = clean_name + ".html"

    # Selection logic: keep one file per article ID
    if article_id not in articles:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
    else:
        # Prefer a non-redirect copy over a redirect
        if articles[article_id]["redirect"] and not is_redirect:
            articles[article_id] = {
                "path": file_path,
                "redirect": is_redirect,
                "filename": filename,
            }

# Copy the selected pages
print("start copying files")
for art in articles.values():
    dst = OUTPUT_DIR / art["filename"]
    try:
        shutil.copy2(art["path"], dst)
    except OSError as e:
        print("❌ Copy failed:", art["filename"], e)

print(f"✅ Unique pages kept: {len(articles)}")
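The json.loads() call on the re-quoted wgPageName value above decodes the JSON-style escapes those values carry; a hand-written fragment (not from a real page) showing the round trip:

import json
import re

PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')

# Hand-written fragment; the \u0027 escape stands in for an apostrophe.
html = '"wgPageName":"Croe\\u0027s_Cutthroats"'

raw_name = PAGENAME_RE.search(html).group(1)
print(json.loads(f'"{raw_name}"'))  # Croe's_Cutthroats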
156 sort_pages.py Normal file
@@ -0,0 +1,156 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path("unique_pages")
OUTPUT_DIR = Path("classified_pages")

CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------

FACTIONS = {
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}

# ---------- helpers ----------


def extract_categories(html: str):
    m = CATEGORY_RE.search(html)
    if not m:
        return []

    raw = "[" + m.group(1) + "]"
    try:
        return [c.lower() for c in json.loads(raw)]
    except Exception:
        return []


def detect_faction(categories):
    for c in categories:
        if c in FACTIONS:
            return c
    return None


def detect_model_type(categories):
    for c, folder in MODEL_TYPES.items():
        if c in categories:
            return folder
    return None


# ---------- classification ----------


def classify(categories):
    # SPELLS
    if "spell" in categories:
        return Path("common/warmahordes rules/spells")

    if "animus" in categories:
        return Path("common/warmahordes rules/animi")

    if "model ability" in categories:
        return Path("common/warmahordes rules/model advantages")

    if "weapon ability" in categories:
        return Path("common/warmahordes rules/weapon qualities")

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)

        if faction and model_type:
            system = (
                "warmachine"
                if faction
                not in {
                    "trollblood",
                    "circle orboros",
                    "skorne",
                    "legion of everblight",
                    "grymkin",
                    "minion",
                }
                else "hordes"
            )

            return Path(f"{system}/{faction}/{model_type}")

    # THEME FORCES
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            system = (
                "warmachine"
                if faction
                not in {
                    "trollblood",
                    "circle orboros",
                    "skorne",
                    "legion of everblight",
                    "grymkin",
                    "minion",
                }
                else "hordes"
            )

            return Path(f"{system}/{faction}/theme forces")

    # fallback
    return Path("common/others")


# ---------- main ----------

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to classify")

for file_path in files:
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    categories = extract_categories(html)

    dest = OUTPUT_DIR / classify(categories)
    dest.mkdir(parents=True, exist_ok=True)

    shutil.copy2(file_path, dest / file_path.name)

print("✅ Classification finished")
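A spot-check of where a few pages should land under the rules above, using hypothetical category lists of the kind extracted from wgCategories:

from pathlib import Path

# Hypothetical category lists and the destination folders classify() should return.
expected = {
    ("model", "cygnar", "warcaster"): Path("warmachine/cygnar/warcaster"),
    ("model", "skorne", "warbeast"): Path("hordes/skorne/warbeasts"),
    ("spell",): Path("common/warmahordes rules/spells"),
}

for cats, dest in expected.items():
    print(list(cats), "->", dest)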