first experiments
This commit is contained in:
commit
36c8bb2354
5 changed files with 542 additions and 0 deletions
88
category_graph_analysis.py
Normal file
88
category_graph_analysis.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
INPUT_DIR = Path(".")  # directory scanned (recursively) for *.html files
OUTPUT_FILE = "category_analysis.json"  # where the per-category summary is written
|
||||
|
||||
|
||||
# ---------------------------
# wgCategories extraction
# ---------------------------
# Captures the JSON array following `wgCategories":` in a page's embedded
# JS config. NOTE(review): [^\]] cannot cross a `]`, so a category name
# containing `]` would truncate the capture — acceptable for MediaWiki
# category names, which don't allow that character.
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    """Return the category names embedded in *text*.

    Searches for the MediaWiki ``wgCategories`` marker, decodes the
    captured JSON array, and keeps only its string entries with
    surrounding whitespace trimmed. Returns ``[]`` when the marker is
    absent or the captured span is not valid JSON.
    """
    hit = WG_RE.search(text)
    if hit is None:
        return []

    try:
        decoded = json.loads(hit.group(1))
    except Exception:
        # Best-effort parse: malformed page -> no categories.
        return []

    return [entry.strip() for entry in decoded if isinstance(entry, str)]
|
||||
|
||||
|
||||
# ---------------------------
# Global analysis
# ---------------------------
# category name -> number of distinct pages it appears on
category_pages = Counter()
# category name -> set of categories seen alongside it on at least one page
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))

print(f"{len(files)} fichiers trouvés")

for path in files:
    if not path.is_file():
        continue

    try:
        html = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        # Unreadable file: skip it rather than abort the whole run.
        continue

    # IMPORTANT: deduplicate within a page so each page counts a
    # category at most once.
    page_cats = set(extract_categories(html))
    if not page_cats:
        continue

    # Occurrence counts: one increment per page per category.
    category_pages.update(page_cats)

    # Co-occurrences: every category on the page neighbours the others.
    for cat in page_cats:
        category_neighbors[cat].update(page_cats - {cat})

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = [
    {
        "name": cat,
        "page_count": count,
        "neighbor_count": len(category_neighbors[cat]),
    }
    for cat, count in category_pages.items()
]

# Sort for analysis: most frequent first, then by name for stable ties.
result.sort(key=lambda row: (-row["page_count"], row["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} catégories écrites dans {OUTPUT_FILE}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue