first experiments
commit 36c8bb2354
5 changed files with 542 additions and 0 deletions
63 analyze_categories.py Normal file
@@ -0,0 +1,63 @@
import json
from collections import defaultdict

INPUT_FILE = "categories.json"
OUTPUT_FILE = "categories_analysis.json"

# typical MediaWiki noise patterns
IGNORE_PATTERNS = [
    "pages using",
    "articles needing",
    "redirect",
    "template",
    "tracking",
    "with broken",
    "cleanup",
    "maintenance",
]


def classify(name, count):
    lname = name.lower()

    # technical / maintenance noise
    if any(p in lname for p in IGNORE_PATTERNS):
        return "technical"

    # singleton = probably a mirror page
    if count == 1:
        return "singleton"

    if count <= 3:
        return "rare"

    if count < 20:
        return "medium"

    return "high"


def main():
    with open(INPUT_FILE, encoding="utf-8") as f:
        data = json.load(f)

    groups = defaultdict(list)

    for cat in data["categories"]:
        name = cat["name"]
        count = cat["count"]

        group = classify(name, count)

        groups[group].append({"name": name, "count": count})

    output = {"summary": {k: len(v) for k, v in groups.items()}, "groups": groups}

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print("Analysis finished →", OUTPUT_FILE)


if __name__ == "__main__":
    main()
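For reference, the categories.json read above is the export produced by extract_categories.py further down in this commit; a minimal, hand-written sketch of that shape (names and counts are made up), annotated with the group each entry would fall into:

import json

# Hand-written sample of the categories.json shape; names and counts are invented.
sample = {
    "total_unique_categories": 3,
    "categories": [
        {"name": "Cygnar", "count": 42},                          # classify() -> "high"
        {"name": "Pages using duplicate arguments", "count": 7},  # classify() -> "technical"
        {"name": "Lonely mirror page", "count": 1},               # classify() -> "singleton"
    ],
}

with open("categories.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=2, ensure_ascii=False)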
88 category_graph_analysis.py Normal file
@@ -0,0 +1,88 @@
import json
import re
from pathlib import Path
from collections import defaultdict, Counter

INPUT_DIR = Path(".")  # directory containing the files
OUTPUT_FILE = "category_analysis.json"


# ---------------------------
# wgCategories extraction
# ---------------------------
WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)


def extract_categories(text: str):
    match = WG_RE.search(text)
    if not match:
        return []

    try:
        raw = json.loads(match.group(1))
        return [c.strip() for c in raw if isinstance(c, str)]
    except Exception:
        return []


# ---------------------------
# Global analysis
# ---------------------------
category_pages = Counter()
category_neighbors = defaultdict(set)

files = list(INPUT_DIR.glob("**/*.html"))

print(f"{len(files)} files found")

for file in files:
    if not file.is_file():
        continue

    try:
        text = file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        continue

    categories = extract_categories(text)

    # IMPORTANT: avoid duplicates within a single page
    categories = list(set(categories))

    if not categories:
        continue

    # count occurrences
    for cat in categories:
        category_pages[cat] += 1

    # co-occurrences
    for cat in categories:
        others = set(categories)
        others.remove(cat)
        category_neighbors[cat].update(others)

print("Total occurrences:", sum(category_pages.values()))
print("Unique categories:", len(category_pages))
print("Singletons:", sum(1 for c in category_pages.values() if c == 1))

# ---------------------------
# Result construction
# ---------------------------
result = []

for cat in category_pages:
    result.append(
        {
            "name": cat,
            "page_count": category_pages[cat],
            "neighbor_count": len(category_neighbors[cat]),
        }
    )

# sort order useful for analysis
result.sort(key=lambda x: (-x["page_count"], x["name"]))

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"OK → {len(result)} categories written to {OUTPUT_FILE}")
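WG_RE targets the category list that MediaWiki embeds in each page's inline JS config; a rough, hand-written fragment of the kind of text it is expected to match (not taken from a real dump, the page and category names are placeholders):

import json
import re

WG_RE = re.compile(r'wgCategories"\s*:\s*(\[[^\]]*\])', re.DOTALL)

# Hand-written approximation of a MediaWiki config blob, not a real page.
snippet = '... "wgPageName":"Stryker1","wgCategories":["Cygnar","Warcaster","Model"], ...'

m = WG_RE.search(snippet)
print(json.loads(m.group(1)))  # ['Cygnar', 'Warcaster', 'Model']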
134 extract_categories.py Normal file
@@ -0,0 +1,134 @@
import json
import re
from pathlib import Path
from collections import defaultdict

# =========================
# CONFIG
# =========================

INPUT_DIR = "."  # directory containing the 700 files
OUTPUT_FILE = "categories.json"

# extensions to analyze
VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}

# =========================
# REGEX PATTERNS
# =========================

# [[Category:Something]]
CATEGORY_WIKI_RE = re.compile(r"\[\[\s*Category\s*:\s*([^\|\]]+)", re.IGNORECASE)

# wgCategories:["A","B"]
WG_CATEGORIES_RE = re.compile(r'wgCategories"\s*:\s*\[(.*?)\]', re.DOTALL)

# "Category name" inside wgCategories
WG_CATEGORY_ITEM_RE = re.compile(r'"([^"]+)"')

# HTML catlinks fallback
HTML_CATEGORY_RE = re.compile(r"<li>\s*<a[^>]*>(.*?)</a>", re.IGNORECASE)

# =========================
# EXTRACTION
# =========================


def extract_categories_from_text(text: str):
    found = set()

    # --- MediaWiki [[Category:...]]
    for match in CATEGORY_WIKI_RE.findall(text):
        found.add(match.strip())

    # --- wgCategories JS block
    wg_match = WG_CATEGORIES_RE.search(text)
    if wg_match:
        block = wg_match.group(1)
        for cat in WG_CATEGORY_ITEM_RE.findall(block):
            found.add(cat.strip())

    # --- HTML fallback
    for match in HTML_CATEGORY_RE.findall(text):
        if match.lower() != "categories":
            found.add(match.strip())

    return found


# =========================
# MAIN SCAN
# =========================


def scan_directory(path: Path):
    categories_count = defaultdict(int)
    scanned_files = 0

    for file in path.rglob("*"):
        if file.suffix.lower() not in VALID_EXTENSIONS:
            continue

        try:
            text = file.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f"[SKIP] {file} ({e})")
            continue

        cats = extract_categories_from_text(text)

        for cat in cats:
            categories_count[cat] += 1

        scanned_files += 1

        if scanned_files % 50 == 0:
            print(f"{scanned_files} files analyzed...")

    return categories_count


# =========================
# EXPORT
# =========================


def export_json(categories_count):
    data = {
        "total_unique_categories": len(categories_count),
        "categories": sorted(
            [
                {"name": name, "count": count}
                for name, count in categories_count.items()
            ],
            key=lambda x: (-x["count"], x["name"].lower()),
        ),
    }

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Export finished → {OUTPUT_FILE}")
    print(f"{len(categories_count)} unique categories found.")


# =========================
# ENTRYPOINT
# =========================


def main():
    path = Path(INPUT_DIR)

    if not path.exists():
        print("Directory not found:", INPUT_DIR)
        return

    print("Analyzing files...\n")

    categories_count = scan_directory(path)
    export_json(categories_count)


if __name__ == "__main__":
    main()
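A small follow-up check of the export, assuming the script has already been run inside the scraped-pages directory:

import json

# Quick inspection of categories.json as written by export_json() above.
with open("categories.json", encoding="utf-8") as f:
    data = json.load(f)

print(data["total_unique_categories"])
print(data["categories"][:5])  # the five most frequent categories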
101 remove_duplicate_pages.py Normal file
@@ -0,0 +1,101 @@
from pathlib import Path
import shutil
import re
import json


INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
OUTPUT_DIR.mkdir(exist_ok=True)

ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
REDIRECT_RE = re.compile(r'"wgIsRedirect":\s*(true|false)')
PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')
INVALID_CHARS = r'[<>:"/\\|?*]'
RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}


def sanitize_filename(name: str) -> str:
    # forbidden chars
    name = re.sub(INVALID_CHARS, "_", name)

    # spaces → underscores
    name = name.replace(" ", "_")

    # remove trailing dots/spaces
    name = name.rstrip(". ")

    # Windows reserved names
    if name.upper() in RESERVED_NAMES:
        name = "_" + name

    return name


articles = {}
print("start parsing files")

for file_path in INPUT_DIR.glob("*.html"):
    html = file_path.read_text(encoding="utf-8", errors="ignore")

    # Article ID
    m = ARTICLE_ID_RE.search(html)
    if not m:
        continue

    article_id = int(m.group(1))
    if article_id == 0:
        continue

    # Redirect flag
    m = REDIRECT_RE.search(html)
    is_redirect = bool(m and m.group(1) == "true")

    # Canonical page name
    m = PAGENAME_RE.search(html)
    if not m:
        continue

    # Decode MediaWiki unicode escapes
    raw_name = m.group(1)
    page_name = json.loads(f'"{raw_name}"')

    # Sanitize filename
    clean_name = page_name.replace("Category:", "")
    clean_name = sanitize_filename(clean_name)

    filename = clean_name + ".html"

    # Selection logic: keep one file per article ID
    if article_id not in articles:
        articles[article_id] = {
            "path": file_path,
            "redirect": is_redirect,
            "filename": filename,
        }
    else:
        # Prefer a non-redirect copy over a redirect
        if articles[article_id]["redirect"] and not is_redirect:
            articles[article_id] = {
                "path": file_path,
                "redirect": is_redirect,
                "filename": filename,
            }

# Copy the selected pages
print("start copying files")
for art in articles.values():
    dst = OUTPUT_DIR / art["filename"]
    try:
        shutil.copy2(art["path"], dst)
    except OSError as e:
        print("❌ Copy failed:", art["filename"], e)

print(f"✅ Unique pages kept: {len(articles)}")
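The json.loads() call on the re-quoted wgPageName value above decodes the JSON-style escapes those values carry; a hand-written fragment (not from a real page) showing the round trip:

import json
import re

PAGENAME_RE = re.compile(r'"wgPageName":"([^"]+)"')

# Hand-written fragment; the \u0027 escape stands in for an apostrophe.
html = '"wgPageName":"Croe\\u0027s_Cutthroats"'

raw_name = PAGENAME_RE.search(html).group(1)
print(json.loads(f'"{raw_name}"'))  # Croe's_Cutthroats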
156 sort_pages.py Normal file
@@ -0,0 +1,156 @@
from pathlib import Path
import shutil
import re
import json

INPUT_DIR = Path("unique_pages")
OUTPUT_DIR = Path("classified_pages")

CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

# ---------- CONFIG ----------

FACTIONS = {
    "cygnar",
    "khador",
    "protectorate of menoth",
    "cryx",
    "retribution of scyrah",
    "convergence of cyriss",
    "cephalyx",
    "mercenary",
    "crucible guard",
    "infernals",
    "trollblood",
    "circle orboros",
    "skorne",
    "legion of everblight",
    "grymkin",
    "minion",
}

MODEL_TYPES = {
    "warcaster unit": "warcaster unit",
    "warlock unit": "warlock unit",
    "warcaster": "warcaster",
    "warlock": "warlocks",
    "infernal master": "masters",
    "warjack": "warjacks",
    "warbeast": "warbeasts",
    "horror": "horrors",
    "monstrosity": "monstrosities",
    "solo": "solos",
    "unit": "units",
    "battle engine": "battle engines",
    "structure": "structure",
}

# ---------- helpers ----------


def extract_categories(html: str):
    m = CATEGORY_RE.search(html)
    if not m:
        return []

    raw = "[" + m.group(1) + "]"
    try:
        return [c.lower() for c in json.loads(raw)]
    except Exception:
        return []


def detect_faction(categories):
    for c in categories:
        if c in FACTIONS:
            return c
    return None


def detect_model_type(categories):
    for c, folder in MODEL_TYPES.items():
        if c in categories:
            return folder
    return None


# ---------- classification ----------


def classify(categories):
    # SPELLS
    if "spell" in categories:
        return Path("common/warmahordes rules/spells")

    if "animus" in categories:
        return Path("common/warmahordes rules/animi")

    if "model ability" in categories:
        return Path("common/warmahordes rules/model advantages")

    if "weapon ability" in categories:
        return Path("common/warmahordes rules/weapon qualities")

    # MODELS
    if "model" in categories:
        faction = detect_faction(categories)
        model_type = detect_model_type(categories)

        if faction and model_type:
            system = (
                "warmachine"
                if faction
                not in {
                    "trollblood",
                    "circle orboros",
                    "skorne",
                    "legion of everblight",
                    "grymkin",
                    "minion",
                }
                else "hordes"
            )

            return Path(f"{system}/{faction}/{model_type}")

    # THEME FORCES
    if "theme force" in categories:
        faction = detect_faction(categories)
        if faction:
            system = (
                "warmachine"
                if faction
                not in {
                    "trollblood",
                    "circle orboros",
                    "skorne",
                    "legion of everblight",
                    "grymkin",
                    "minion",
                }
                else "hordes"
            )

            return Path(f"{system}/{faction}/theme forces")

    # fallback
    return Path("common/others")


# ---------- main ----------

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to classify")

for file_path in files:
    html = file_path.read_text(encoding="utf-8", errors="ignore")
    categories = extract_categories(html)

    dest = OUTPUT_DIR / classify(categories)
    dest.mkdir(parents=True, exist_ok=True)

    shutil.copy2(file_path, dest / file_path.name)

print("✅ Classification finished")
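A spot-check of where a few pages should land under the rules above, using hypothetical category lists of the kind extracted from wgCategories:

from pathlib import Path

# Hypothetical category lists and the destination folders classify() should return.
expected = {
    ("model", "cygnar", "warcaster"): Path("warmachine/cygnar/warcaster"),
    ("model", "skorne", "warbeast"): Path("hordes/skorne/warbeasts"),
    ("spell",): Path("common/warmahordes rules/spells"),
}

for cats, dest in expected.items():
    print(list(cats), "->", dest)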