commit 36c8bb23547ea56e6ea7827458677fd1b559d28d
Author: maximator
Date:   Fri Apr 3 07:56:40 2026 +0200

    first experiments

diff --git a/analyze_categories.py b/analyze_categories.py
new file mode 100644
index 0000000..70530e8
--- /dev/null
+++ b/analyze_categories.py
@@ -0,0 +1,63 @@
+import json
+from collections import defaultdict
+
+INPUT_FILE = "categories.json"
+OUTPUT_FILE = "categories_analysis.json"
+
+# typical MediaWiki noise patterns
+IGNORE_PATTERNS = [
+    "pages using",
+    "articles needing",
+    "redirect",
+    "template",
+    "tracking",
+    "with broken",
+    "cleanup",
+    "maintenance",
+]
+
+
+def classify(name, count):
+    lname = name.lower()
+
+    # technical
+    if any(p in lname for p in IGNORE_PATTERNS):
+        return "technical"
+
+    # singleton = probably a mirror page
+    if count == 1:
+        return "singleton"
+
+    if count <= 3:
+        return "rare"
+
+    if count < 20:
+        return "medium"
+
+    return "high"
+
+
+def main():
+    with open(INPUT_FILE, encoding="utf-8") as f:
+        data = json.load(f)
+
+    groups = defaultdict(list)
+
+    for cat in data["categories"]:
+        name = cat["name"]
+        count = cat["count"]
+
+        group = classify(name, count)
+
+        groups[group].append({"name": name, "count": count})
+
+    output = {"summary": {k: len(v) for k, v in groups.items()}, "groups": groups}
+
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(output, f, indent=2, ensure_ascii=False)
+
+    print("Analysis finished →", OUTPUT_FILE)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/category_graph_analysis.py b/category_graph_analysis.py
new file mode 100644
index 0000000..35f97ba
--- /dev/null
+++ b/category_graph_analysis.py
@@ -0,0 +1,88 @@
+import json
+import re
+from pathlib import Path
+from collections import defaultdict, Counter
+
+INPUT_DIR = Path(".")  # directory containing the files
+OUTPUT_FILE = "category_analysis.json"
+
+
+# ---------------------------
+# wgCategories extraction
+# ---------------------------
+WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)
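+# Illustrative shape of the inline JS config this regex targets (fabricated
+# example, not taken from a real page):
+#   "wgCategories":["Cygnar","Warcaster","Living models"],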
+
+
+def extract_categories(text: str):
+    match = WG_RE.search(text)
+    if not match:
+        return []
+
+    try:
+        raw = json.loads(match.group(1))
+        return [c.strip() for c in raw if isinstance(c, str)]
+    except Exception:
+        return []
+
+
+# ---------------------------
+# Global analysis
+# ---------------------------
+category_pages = Counter()
+category_neighbors = defaultdict(set)
+
+files = list(INPUT_DIR.glob("**/*.html"))
+
+print(f"{len(files)} files found")
+
+for file in files:
+    if not file.is_file():
+        continue
+
+    try:
+        text = file.read_text(encoding="utf-8", errors="ignore")
+    except Exception:
+        continue
+
+    categories = extract_categories(text)
+
+    # IMPORTANT: avoid duplicates within a single page
+    categories = list(set(categories))
+
+    if not categories:
+        continue
+
+    # count occurrences
+    for cat in categories:
+        category_pages[cat] += 1
+
+    # co-occurrences
+    for cat in categories:
+        others = set(categories)
+        others.remove(cat)
+        category_neighbors[cat].update(others)
+
+print("Total occurrences:", sum(category_pages.values()))
+print("Unique categories:", len(category_pages))
+print("Singletons:", sum(1 for c in category_pages.values() if c == 1))
+# ---------------------------
+# Build result
+# ---------------------------
+result = []
+
+for cat in category_pages:
+    result.append(
+        {
+            "name": cat,
+            "page_count": category_pages[cat],
+            "neighbor_count": len(category_neighbors[cat]),
+        }
+    )
+
+# sort order that is useful for analysis
+result.sort(key=lambda x: (-x["page_count"], x["name"]))
+
+with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+    json.dump(result, f, indent=2, ensure_ascii=False)
+
+print(f"OK → {len(result)} categories written to {OUTPUT_FILE}")
diff --git a/extract_categories.py b/extract_categories.py
new file mode 100644
index 0000000..bd1ef55
--- /dev/null
+++ b/extract_categories.py
@@ -0,0 +1,134 @@
+import json
+import re
+from pathlib import Path
+from collections import defaultdict
+
+# =========================
+# CONFIG
+# =========================
+
+INPUT_DIR = "."  # directory containing the 700 files
+OUTPUT_FILE = "categories.json"
+
+# extensions to analyze
+VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}
+
+# =========================
+# REGEX PATTERNS
+# =========================
+
+# [[Category:Something]]
+CATEGORY_WIKI_RE = re.compile(r"\[\[\s*Category\s*:\s*([^\|\]]+)", re.IGNORECASE)
+
+# "wgCategories":["A","B"]
+WG_CATEGORIES_RE = re.compile(r'wgCategories"\s*:\s*\[(.*?)\]', re.DOTALL)
+
+# "Category name" inside wgCategories
+WG_CATEGORY_ITEM_RE = re.compile(r'"([^"]+)"')
+
+# HTML catlinks fallback
+HTML_CATEGORY_RE = re.compile(r"<li>\s*<a [^>]*>(.*?)</a>", re.IGNORECASE)
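+# Illustrative catlinks markup the fallback is meant to match (fabricated
+# example):
+#   <li><a href="/wiki/Category:Cygnar" title="Category:Cygnar">Cygnar</a></li>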
+
+# =========================
+# EXTRACTION
+# =========================
+
+
+def extract_categories_from_text(text: str):
+    found = set()
+
+    # --- MediaWiki [[Category:...]]
+    for match in CATEGORY_WIKI_RE.findall(text):
+        found.add(match.strip())
+
+    # --- wgCategories JS block
+    wg_match = WG_CATEGORIES_RE.search(text)
+    if wg_match:
+        block = wg_match.group(1)
+        for cat in WG_CATEGORY_ITEM_RE.findall(block):
+            found.add(cat.strip())
+
+    # --- HTML fallback
+    for match in HTML_CATEGORY_RE.findall(text):
+        if match.lower() != "categories":
+            found.add(match.strip())
+
+    return found
+
+
+# =========================
+# MAIN SCAN
+# =========================
+
+
+def scan_directory(path: Path):
+    categories_count = defaultdict(int)
+    scanned_files = 0
+
+    for file in path.rglob("*"):
+        if file.suffix.lower() not in VALID_EXTENSIONS:
+            continue
+
+        try:
+            text = file.read_text(encoding="utf-8", errors="ignore")
+        except Exception as e:
+            print(f"[SKIP] {file} ({e})")
+            continue
+
+        cats = extract_categories_from_text(text)
+
+        for cat in cats:
+            categories_count[cat] += 1
+
+        scanned_files += 1
+
+        if scanned_files % 50 == 0:
+            print(f"{scanned_files} files analyzed...")
+
+    return categories_count
+
+
+# =========================
+# EXPORT
+# =========================
+
+
+def export_json(categories_count):
+    data = {
+        "total_unique_categories": len(categories_count),
+        "categories": sorted(
+            [
+                {"name": name, "count": count}
+                for name, count in categories_count.items()
+            ],
+            key=lambda x: (-x["count"], x["name"].lower()),
+        ),
+    }
+
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+
+    print(f"\n✅ Export finished → {OUTPUT_FILE}")
+    print(f"{len(categories_count)} unique categories found.")
+
+
+# =========================
+# ENTRYPOINT
+# =========================
+
+
+def main():
+    path = Path(INPUT_DIR)
+
+    if not path.exists():
+        print("Directory not found:", INPUT_DIR)
+        return
+
+    print("Analyzing files...\n")
+
+    categories_count = scan_directory(path)
+    export_json(categories_count)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/remove_duplicate_pages.py b/remove_duplicate_pages.py
new file mode 100644
index 0000000..20e5d8c
--- /dev/null
+++ b/remove_duplicate_pages.py
@@ -0,0 +1,101 @@
+from pathlib import Path
+import shutil
+import re
+import json
+
+
+INPUT_DIR = Path(".")
+OUTPUT_DIR = Path("unique_pages")
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
+REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
+PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
+INVALID_CHARS = r'[<>:"/\\|?*]'
+RESERVED_NAMES = {
+    "CON",
+    "PRN",
+    "AUX",
+    "NUL",
+    *(f"COM{i}" for i in range(1, 10)),
+    *(f"LPT{i}" for i in range(1, 10)),
+}
+
+
+def sanitize_filename(name: str) -> str:
+    # forbidden chars
+    name = re.sub(INVALID_CHARS, "_", name)
+
+    # spaces → underscore
+    name = name.replace(" ", "_")
+
+    # remove trailing dots/spaces
+    name = name.rstrip(". ")
+
+    # Windows reserved names
+    if name.upper() in RESERVED_NAMES:
+        name = "_" + name
+
+    return name
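+
+# Quick sanity checks (hypothetical inputs, not real page names):
+#   sanitize_filename('Who?s "Next"')  ->  'Who_s__Next_'
+#   sanitize_filename('NUL')           ->  '_NUL'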
") + + # Windows reserved names + if name.upper() in RESERVED_NAMES: + name = "_" + name + + return name + + +articles = {} +print("start parsing files") + +for file_path in INPUT_DIR.glob("*.html"): + + html = file_path.read_text(encoding="utf-8", errors="ignore") + + # Article ID + m = ARTICLE_ID_RE.search(html) + if not m: + continue + + article_id = int(m.group(1)) + if article_id == 0: + continue + + # Redirect + m = REDIRECT_RE.search(html) + is_redirect = bool(m and m.group(1) == "true") + + # Canonical page name + m = PAGENAME_RE.search(html) + if not m: + continue + + # Decode MediaWiki unicode escapes + raw_name = m.group(1) + page_name = json.loads(f'"{raw_name}"') + + # Sanitize filename + clean_name = page_name.replace("Category:", "") + clean_name = sanitize_filename(clean_name) + + filename = clean_name + ".html" + + # Selection logic + if article_id not in articles: + articles[article_id] = { + "path": file_path, + "redirect": is_redirect, + "filename": filename, + } + else: + # Avoid redirect + if articles[article_id]["redirect"] and not is_redirect: + articles[article_id] = { + "path": file_path, + "redirect": is_redirect, + "filename": filename, + } + +# Copy +print("start copying files") +for art in articles.values(): + dst = OUTPUT_DIR / art["filename"] + try: + shutil.copy2(art["path"], dst) + except OSError as e: + print("❌ Copy failed:", art["filename"], e) +print(f"✅ Unique pages kept: {len(articles)}") diff --git a/sort_pages.py b/sort_pages.py new file mode 100644 index 0000000..59eeaef --- /dev/null +++ b/sort_pages.py @@ -0,0 +1,156 @@ +from pathlib import Path +import shutil +import re +import json + +INPUT_DIR = Path("unique_pages") +OUTPUT_DIR = Path("classified_pages") + +CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]') + +# ---------- CONFIG ---------- + +FACTIONS = { + "cygnar", + "khador", + "protectorate of menoth", + "cryx", + "retribution of scyrah", + "convergence of cyriss", + "cephalyx", + "mercenary", + "crucible guard", + "infernals", + "trollblood", + "circle orboros", + "skorne", + "legion of everblight", + "grymkin", + "minion", +} + +MODEL_TYPES = { + "warcaster unit": "warcaster unit", + "warlock unit": "warlock unit", + "warcaster": "warcaster", + "warlock": "warlocks", + "infernal master": "masters", + "warjack": "warjacks", + "warbeast": "warbeasts", + "horror": "horrors", + "monstrosity": "monstrosities", + "solo": "solos", + "unit": "units", + "battle engine": "battle engines", + "structure": "structure", +} + +# ---------- helpers ---------- + + +def extract_categories(html: str): + m = CATEGORY_RE.search(html) + if not m: + return [] + + raw = "[" + m.group(1) + "]" + try: + return [c.lower() for c in json.loads(raw)] + except: + return [] + + +def detect_faction(categories): + for c in categories: + if c in FACTIONS: + return c + return None + + +def detect_model_type(categories): + for c, folder in MODEL_TYPES.items(): + if c in categories: + return folder + return None + + +# ---------- classification ---------- + + +def classify(categories): + + # SPELLS + if "spell" in categories: + return Path("common/warmahordes rules/spells") + + if "animus" in categories: + return Path("common/warmahordes rules/animi") + + if "model ability" in categories: + return Path("common/warmahordes rules/model advantages") + + if "weapon ability" in categories: + return Path("common/warmahordes rules/weapon qualities") + + # MODELS + if "model" in categories: + faction = detect_faction(categories) + model_type = detect_model_type(categories) + + 
+
+
+# ---------- classification ----------
+
+
+def classify(categories):
+
+    # SPELLS
+    if "spell" in categories:
+        return Path("common/warmahordes rules/spells")
+
+    if "animus" in categories:
+        return Path("common/warmahordes rules/animi")
+
+    if "model ability" in categories:
+        return Path("common/warmahordes rules/model advantages")
+
+    if "weapon ability" in categories:
+        return Path("common/warmahordes rules/weapon qualities")
+
+    # MODELS
+    if "model" in categories:
+        faction = detect_faction(categories)
+        model_type = detect_model_type(categories)
+
+        if faction and model_type:
+            return Path(f"{faction_system(faction)}/{faction}/{model_type}")
+
+    # THEME FORCE
+    if "theme force" in categories:
+        faction = detect_faction(categories)
+        if faction:
+            return Path(f"{faction_system(faction)}/{faction}/theme forces")
+
+    # fallback
+    return Path("common/others")
+
+
+# ---------- main ----------
+
+files = list(INPUT_DIR.glob("*.html"))
+print(f"{len(files)} pages to classify")
+
+for file_path in files:
+
+    html = file_path.read_text(encoding="utf-8", errors="ignore")
+    categories = extract_categories(html)
+
+    dest = OUTPUT_DIR / classify(categories)
+    dest.mkdir(parents=True, exist_ok=True)
+
+    shutil.copy2(file_path, dest / file_path.name)
+
+print("✅ Classification finished")
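+
+# e.g. a page whose wgCategories are ["Model", "Cygnar", "Warjack"]
+# (hypothetical) would land in classified_pages/warmachine/cygnar/warjacks/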