import json
import re
from pathlib import Path
from collections import defaultdict, Counter

INPUT_DIR = Path(".")  # directory containing the HTML files to scan
OUTPUT_FILE = "category_analysis.json"

# ---------------------------
# wgCategories extraction
# ---------------------------
# Captures the JSON array that follows `wgCategories":` in a page's embedded
# JS config. NOTE(review): `[^\]]*` assumes no category name contains `]` —
# such a page would yield a truncated, non-JSON capture and be skipped.
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str) -> list[str]:
    """Return the stripped category names from *text*'s wgCategories array.

    Returns [] when the marker is absent or the captured payload is not
    valid JSON; non-string array entries are silently dropped.
    """
    match = WG_RE.search(text)
    if not match:
        return []
    try:
        raw = json.loads(match.group(1))
    except Exception:
        # Garbled/truncated capture — treat the page as having no categories.
        return []
    return [c.strip() for c in raw if isinstance(c, str)]


# ---------------------------
# Global analysis
# ---------------------------
category_pages = Counter()             # category -> number of pages it appears on
category_neighbors = defaultdict(set)  # category -> categories co-occurring on some page

files = list(INPUT_DIR.glob("**/*.html"))
print(f"{len(files)} fichiers trouvés")

for file in files:
    if not file.is_file():
        continue
    try:
        text = file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        continue

    # IMPORTANT: deduplicate within a single page so a repeated category
    # counts only once per page.
    cats = set(extract_categories(text))
    if not cats:
        continue

    # Counter.update over a set adds exactly 1 per distinct category.
    category_pages.update(cats)

    # Co-occurrences: every other category on the same page is a neighbor.
    # One set built per page; set difference avoids a copy+remove per element.
    for cat in cats:
        category_neighbors[cat].update(cats - {cat})

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = [
    {
        "name": cat,
        "page_count": count,
        "neighbor_count": len(category_neighbors[cat]),
    }
    for cat, count in category_pages.items()
]

# Most frequent first, name as tie-breaker — convenient ordering for analysis.
result.sort(key=lambda x: (-x["page_count"], x["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} catégories écrites dans {OUTPUT_FILE}")