first experiments

maximator 2026-04-03 07:56:40 +02:00
commit 36c8bb2354
5 changed files with 542 additions and 0 deletions

63
analyze_categories.py Normal file

@@ -0,0 +1,63 @@
import json
from collections import defaultdict

INPUT_FILE = "categories.json"
OUTPUT_FILE = "categories_analysis.json"

# typical MediaWiki noise patterns
IGNORE_PATTERNS = [
    "pages using",
    "articles needing",
    "redirect",
    "template",
    "tracking",
    "with broken",
    "cleanup",
    "maintenance",
]


def classify(name, count):
    lname = name.lower()
    # technical
    if any(p in lname for p in IGNORE_PATTERNS):
        return "technical"
    # singleton = probably a mirror page
    if count == 1:
        return "singleton"
    if count <= 3:
        return "rare"
    if count < 20:
        return "medium"
    return "high"


def main():
    with open(INPUT_FILE, encoding="utf-8") as f:
        data = json.load(f)

    groups = defaultdict(list)
    for cat in data["categories"]:
        name = cat["name"]
        count = cat["count"]
        group = classify(name, count)
        groups[group].append({"name": name, "count": count})

    output = {"summary": {k: len(v) for k, v in groups.items()}, "groups": groups}
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("Analysis complete →", OUTPUT_FILE)


if __name__ == "__main__":
    main()
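
A quick sanity check of classify() against the thresholds above; pasting these lines at the bottom of the script should pass (the category names are invented):

assert classify("Pages using duplicate arguments in template calls", 40) == "technical"
assert classify("Cygnar", 1) == "singleton"
assert classify("Warcaster", 3) == "rare"
assert classify("Spell", 19) == "medium"
assert classify("Model", 250) == "high"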


@@ -0,0 +1,88 @@
import json
import re
from pathlib import Path
from collections import defaultdict, Counter

INPUT_DIR = Path(".")  # directory containing the files
OUTPUT_FILE = "category_analysis.json"

# ---------------------------
# wgCategories extraction
# ---------------------------
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    match = WG_RE.search(text)
    if not match:
        return []
    try:
        raw = json.loads(match.group(1))
        return [c.strip() for c in raw if isinstance(c, str)]
    except Exception:
        return []


# ---------------------------
# Global analysis
# ---------------------------
category_pages = Counter()
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))
print(f"{len(files)} files found")

for file in files:
    if not file.is_file():
        continue
    try:
        text = file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        continue

    categories = extract_categories(text)
    # IMPORTANT: avoid counting duplicates within a single page
    categories = list(set(categories))
    if not categories:
        continue

    # count occurrences
    for cat in categories:
        category_pages[cat] += 1

    # co-occurrences
    for cat in categories:
        others = set(categories)
        others.remove(cat)
        category_neighbors[cat].update(others)

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = []
for cat in category_pages:
    result.append(
        {
            "name": cat,
            "page_count": category_pages[cat],
            "neighbor_count": len(category_neighbors[cat]),
        }
    )

# a useful sort order for analysis
result.sort(key=lambda x: (-x["page_count"], x["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} categories written to {OUTPUT_FILE}")

134
extract_categories.py Normal file

@@ -0,0 +1,134 @@
import json
import re
from pathlib import Path
from collections import defaultdict

# =========================
# CONFIG
# =========================
INPUT_DIR = "."  # directory containing the 700 files
OUTPUT_FILE = "categories.json"

# extensions to scan
VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}

# =========================
# REGEX PATTERNS
# =========================
# [[Category:Something]]
CATEGORY_WIKI_RE = re.compile(r"\[\[\s*Category\s*:\s*([^\|\]]+)", re.IGNORECASE)

# wgCategories:["A","B"]
WG_CATEGORIES_RE = re.compile(r'wgCategories"\s*:\s*\[(.*?)\]', re.DOTALL)

# "Category name" inside wgCategories
WG_CATEGORY_ITEM_RE = re.compile(r'"([^"]+)"')

# HTML catlinks fallback
HTML_CATEGORY_RE = re.compile(r"<li>\s*<a[^>]*>(.*?)</a>", re.IGNORECASE)


# =========================
# EXTRACTION
# =========================
def extract_categories_from_text(text: str):
    found = set()

    # --- MediaWiki [[Category:...]]
    for match in CATEGORY_WIKI_RE.findall(text):
        found.add(match.strip())

    # --- wgCategories JS block
    wg_match = WG_CATEGORIES_RE.search(text)
    if wg_match:
        block = wg_match.group(1)
        for cat in WG_CATEGORY_ITEM_RE.findall(block):
            found.add(cat.strip())

    # --- HTML fallback
    for match in HTML_CATEGORY_RE.findall(text):
        if match.lower() != "categories":
            found.add(match.strip())

    return found


# =========================
# MAIN SCAN
# =========================
def scan_directory(path: Path):
    categories_count = defaultdict(int)
    scanned_files = 0

    for file in path.rglob("*"):
        if file.suffix.lower() not in VALID_EXTENSIONS:
            continue
        try:
            text = file.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"[SKIP] {file} ({e})")
            continue

        cats = extract_categories_from_text(text)
        for cat in cats:
            categories_count[cat] += 1

        scanned_files += 1
        if scanned_files % 50 == 0:
            print(f"{scanned_files} files scanned...")

    return categories_count


# =========================
# EXPORT
# =========================
def export_json(categories_count):
    data = {
        "total_unique_categories": len(categories_count),
        "categories": sorted(
            [
                {"name": name, "count": count}
                for name, count in categories_count.items()
            ],
            key=lambda x: (-x["count"], x["name"].lower()),
        ),
    }
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Export complete → {OUTPUT_FILE}")
    print(f"{len(categories_count)} unique categories found.")


# =========================
# ENTRYPOINT
# =========================
def main():
    path = Path(INPUT_DIR)
    if not path.exists():
        print("Directory not found:", INPUT_DIR)
        return
    print("Scanning files...\n")
    categories_count = scan_directory(path)
    export_json(categories_count)


if __name__ == "__main__":
    main()
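
A quick check of the three extraction paths: appending this to the script should print all three categories (the markup fragment is synthetic):

sample = """
[[Category:Cygnar]]
"wgCategories":["Warcaster"]
<div id="catlinks"><ul><li><a href="#">Model</a></li></ul></div>
"""
print(sorted(extract_categories_from_text(sample)))  # ['Cygnar', 'Model', 'Warcaster']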

101
remove_duplicate_pages.py Normal file

@@ -0,0 +1,101 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')

INVALID_CHARS = r'[<>:"/\\|?*]'
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}


def sanitize_filename(name: str) -> str:
    # forbidden chars
    name = re.sub(INVALID_CHARS, "_", name)
    # spaces → underscores
    name = name.replace(" ", "_")
    # remove trailing dots/spaces
    name = name.rstrip(". ")
    # Windows reserved names
    if name.upper() in RESERVED_NAMES:
        name = "_" + name
    return name


articles = {}

print("start parsing files")
for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue
    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki unicode escapes
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Sanitize filename
    clean_name = page_name.replace("Category:", "")
    clean_name = sanitize_filename(clean_name)
    filename = clean_name + ".html"

    # Selection logic: keep one copy per article id,
    # preferring a non-redirect page over a redirect
    if article_id not in articles:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
    else:
        if articles[article_id]["redirect"] and not is_redirect:
            articles[article_id] = {
                "path": file_path,
                "redirect": is_redirect,
                "filename": filename,
            }

# Copy the selected pages
print("start copying files")
for art in articles.values():
    dst = OUTPUT_DIR / art["filename"]
    try:
        shutil.copy2(art["path"], dst)
    except OSError as e:
        print("❌ Copy failed:", art["filename"], e)

print(f"✅ Unique pages kept: {len(articles)}")

156
sort_pages.py Normal file

@@ -0,0 +1,156 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path("unique_pages")
OUTPUT_DIR = Path("classified_pages")

CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------
FACTIONS = {
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

# factions that belong to Hordes; everything else is Warmachine
HORDES_FACTIONS = {
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

# checked in order, so the more specific keys come first
# ("warcaster unit" before "warcaster", etc.)
MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}


# ---------- helpers ----------
def extract_categories(html: str):
    m = CATEGORY_RE.search(html)
    if not m:
        return []
    raw = "[" + m.group(1) + "]"
    try:
        return [c.lower() for c in json.loads(raw)]
    except Exception:
        return []


def detect_faction(categories):
    for c in categories:
        if c in FACTIONS:
            return c
    return None


def detect_model_type(categories):
    for c, folder in MODEL_TYPES.items():
        if c in categories:
            return folder
    return None


def detect_system(faction):
    return "hordes" if faction in HORDES_FACTIONS else "warmachine"


# ---------- classification ----------
def classify(categories):
    # RULES PAGES
    if "spell" in categories:
        return Path("common/warmahordes rules/spells")
    if "animus" in categories:
        return Path("common/warmahordes rules/animi")
    if "model ability" in categories:
        return Path("common/warmahordes rules/model advantages")
    if "weapon ability" in categories:
        return Path("common/warmahordes rules/weapon qualities")

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)
        if faction and model_type:
            return Path(f"{detect_system(faction)}/{faction}/{model_type}")

    # THEME FORCES
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            return Path(f"{detect_system(faction)}/{faction}/theme forces")

    # fallback
    return Path("common/others")


# ---------- main ----------
files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to classify")

for file_path in files:
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    categories = extract_categories(html)

    dest = OUTPUT_DIR / classify(categories)
    dest.mkdir(parents=True, exist_ok=True)
    shutil.copy2(file_path, dest / file_path.name)

print("✅ Classification complete")