whu_migration_scripts/category_graph_analysis.py

89 lines
2.1 KiB
Python
Raw Normal View History

2026-04-03 07:56:40 +02:00
import json
import re
from pathlib import Path
from collections import defaultdict, Counter
2026-04-03 15:50:40 +02:00
# Root directory searched recursively for *.html pages; the path is relative,
# so it is resolved against the current working directory.
INPUT_DIR = Path("../unique_pages")
# Path of the JSON report written at the end of the script (also relative to
# the current working directory).
OUTPUT_FILE = "../category_analysis.json"
2026-04-03 07:56:40 +02:00
# ---------------------------
# wgCategories extraction
# ---------------------------
# Captures the bracketed array that follows a `wgCategories"` key. The array
# body is matched up to the first `]`, so it must not contain a nested `]`.
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    """Return the category names found in the ``wgCategories`` array of *text*.

    Only the first ``wgCategories`` occurrence is considered. The captured
    array is parsed as JSON; non-string items are ignored, surrounding
    whitespace is stripped from each name, and names that end up empty are
    dropped.

    Args:
        text: Raw page content to search.

    Returns:
        A list of category names in document order (duplicates are kept).
        The list is empty when no array is found or when it is not valid JSON.
    """
    match = WG_RE.search(text)
    if match is None:
        return []

    # Keep the try body to the parse itself so that only malformed input is
    # treated as "no categories"; json.JSONDecodeError is a ValueError.
    try:
        raw = json.loads(match.group(1))
    except (ValueError, RecursionError):
        return []

    stripped = (item.strip() for item in raw if isinstance(item, str))
    # A whitespace-only entry is not a usable category name.
    return [name for name in stripped if name]
# ---------------------------
# Global analysis
# ---------------------------
# category name -> number of pages that list it
category_pages = Counter()
# category name -> every other category seen together with it on some page
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))
print(f"{len(files)} fichiers trouvés")

for file in files:
    if not file.is_file():
        continue

    try:
        # Undecodable bytes are dropped rather than failing the whole page.
        text = file.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        # Best effort: keep going, but report the file so that incomplete
        # statistics are not produced silently.
        print(f"Lecture impossible, fichier ignoré : {file} ({exc})")
        continue

    # IMPORTANT: a page counts at most once per category, so deduplicate.
    categories = set(extract_categories(text))
    if not categories:
        continue

    # Occurrences: one more page for each category present.
    category_pages.update(categories)

    # Co-occurrences: every other category on the page is a neighbor.
    for cat in categories:
        category_neighbors[cat].update(categories - {cat})

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))
# ---------------------------
# Result construction
# ---------------------------
# Ordering that is handy for analysis: most-used categories first, ties
# broken alphabetically by name.
ranked = sorted(category_pages.items(), key=lambda pair: (-pair[1], pair[0]))

result = [
    {
        "name": name,
        "page_count": count,
        "neighbor_count": len(category_neighbors[name]),
    }
    for name, count in ranked
]

# ensure_ascii=False keeps accented category names readable in the report.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    json.dump(result, out, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} catégories écrites dans {OUTPUT_FILE}")