# Aggregate wgCategories from saved MediaWiki HTML pages into a JSON summary.
import json
import re
from pathlib import Path
from collections import defaultdict, Counter
# Root directory scanned (recursively) for saved HTML pages.
INPUT_DIR = Path(".")  # directory containing the files

# Destination file for the per-category JSON summary written at the end.
OUTPUT_FILE = "category_analysis.json"
# ---------------------------
# wgCategories extraction
# ---------------------------
# Captures the JSON array following a `wgCategories":` key, as emitted in the
# inline mw.config script of a MediaWiki page. DOTALL lets the array span lines.
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    """Return the page's wgCategories as a list of stripped strings.

    Best-effort: returns an empty list when the marker is absent or the
    captured array is not valid JSON. Non-string array entries are dropped.
    """
    found = WG_RE.search(text)
    if found is None:
        return []

    try:
        parsed = json.loads(found.group(1))
    except Exception:
        # Truncated or malformed array — treat the page as category-less.
        return []

    return [item.strip() for item in parsed if isinstance(item, str)]
# ---------------------------
# Global analysis
# ---------------------------
# category -> number of pages on which it appears (deduplicated per page)
category_pages = Counter()
# category -> set of categories seen alongside it on at least one page
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))

print(f"{len(files)} fichiers trouvés")

for path in files:
    if not path.is_file():
        continue

    try:
        html = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        # Unreadable file — skip it rather than abort the whole run.
        continue

    # IMPORTANT: deduplicate within a single page so each category counts
    # at most once per page.
    cats = list(set(extract_categories(html)))
    if not cats:
        continue

    # Single pass: per-page occurrence count plus co-occurrence neighbors.
    for cat in cats:
        category_pages[cat] += 1
        category_neighbors[cat].update(other for other in cats if other != cat)

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))
# ---------------------------
# Result construction
# ---------------------------
result = [
    {
        "name": cat,
        "page_count": count,
        "neighbor_count": len(category_neighbors[cat]),
    }
    for cat, count in category_pages.items()
]

# Most frequent first; alphabetical tie-break keeps the output stable.
result.sort(key=lambda row: (-row["page_count"], row["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} catégories écrites dans {OUTPUT_FILE}")