first experiments
This commit is contained in:
commit
36c8bb2354
5 changed files with 542 additions and 0 deletions
88
category_graph_analysis.py
Normal file
88
category_graph_analysis.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
INPUT_DIR = Path(".")  # directory scanned (recursively) for *.html files
OUTPUT_FILE = "category_analysis.json"  # where the per-category summary is written
|
||||
|
||||
|
||||
# ---------------------------
# wgCategories extraction
# ---------------------------
# Captures the JSON array following `wgCategories":` in a page's embedded
# JS config. NOTE(review): [^\]] cannot cross a `]`, so a category name
# containing `]` would truncate the capture — acceptable for MediaWiki
# category names, which don't allow that character.
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    """Return the category names embedded in *text*.

    Searches for the MediaWiki ``wgCategories`` marker, decodes the
    captured JSON array, and keeps only its string entries with
    surrounding whitespace trimmed. Returns ``[]`` when the marker is
    absent or the captured span is not valid JSON.
    """
    hit = WG_RE.search(text)
    if hit is None:
        return []

    try:
        decoded = json.loads(hit.group(1))
    except Exception:
        # Best-effort parse: malformed page -> no categories.
        return []

    return [entry.strip() for entry in decoded if isinstance(entry, str)]
|
||||
|
||||
|
||||
# ---------------------------
# Global analysis
# ---------------------------
# category name -> number of distinct pages it appears on
category_pages = Counter()
# category name -> set of categories seen alongside it on at least one page
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))

print(f"{len(files)} fichiers trouvés")

for path in files:
    if not path.is_file():
        continue

    try:
        html = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        # Unreadable file: skip it rather than abort the whole run.
        continue

    # IMPORTANT: deduplicate within a page so each page counts a
    # category at most once.
    page_cats = set(extract_categories(html))
    if not page_cats:
        continue

    # Occurrence counts: one increment per page per category.
    category_pages.update(page_cats)

    # Co-occurrences: every category on the page neighbours the others.
    for cat in page_cats:
        category_neighbors[cat].update(page_cats - {cat})

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = [
    {
        "name": cat,
        "page_count": count,
        "neighbor_count": len(category_neighbors[cat]),
    }
    for cat, count in category_pages.items()
]

# Sort for analysis: most frequent first, then by name for stable ties.
result.sort(key=lambda row: (-row["page_count"], row["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} catégories écrites dans {OUTPUT_FILE}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue