tweak paths and find links
parent 36c8bb2354
commit e4aaa33137
7 changed files with 309 additions and 10 deletions
@@ -1,8 +1,8 @@
 import json
 from collections import defaultdict

-INPUT_FILE = "categories.json"
-OUTPUT_FILE = "categories_analysis.json"
+INPUT_FILE = "../categories.json"
+OUTPUT_FILE = "../categories_analysis.json"

 # typical MediaWiki noise patterns
 IGNORE_PATTERNS = [
build_link_registry.py (Normal file, +177 lines)
@@ -0,0 +1,177 @@
from pathlib import Path
import json
import re
from bs4 import BeautifulSoup

# ======================
# CONFIG
# ======================

INPUT_DIR = Path("../unique_pages")
OUTPUT_DIR = Path("../link_registry")

OUTPUT_DIR.mkdir(exist_ok=True)

# MediaWiki-like prefixes
PREFIXES = [
    "category:",
    "category_",
    "file:",
    "template:",
]

# ======================
# HELPERS
# ======================

def normalize(title: str) -> str:
    """Canonical key normalization."""
    if not title:
        return ""

    title = title.strip()
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title)

    return title.lower()


def slugify(title: str) -> str:
    return normalize(title).replace(" ", "_")


def strip_prefix(title: str) -> str:
    t = title.lower()
    for p in PREFIXES:
        if t.startswith(p):
            return title[len(p):]
    return title


def extract_title(soup: BeautifulSoup):
    """Try multiple strategies to extract page title."""

    # Strategy 1: MediaWiki heading
    h1 = soup.find("h1", id="firstHeading")
    if h1:
        return h1.get_text(strip=True)

    # Strategy 2: HTML title
    if soup.title:
        title = soup.title.get_text()
        if "-" in title:
            return title.split("-")[0].strip()
        return title.strip()

    return None


def detect_redirect(soup: BeautifulSoup):
    """Detect MediaWiki redirect pages."""
    text = soup.get_text(" ", strip=True).lower()

    # HTTrack redirects often contain this
    if "#redirect" in text:
        link = soup.find("a")
        if link and link.get("href"):
            return link["href"]

    # alternative pattern
    redirect_note = soup.find(class_="redirectText")
    if redirect_note:
        link = redirect_note.find("a")
        if link:
            return link.get("href")

    return None


# ======================
# MAIN
# ======================

title_registry = {}
alias_registry = {}
redirects = {}
unresolved = []

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} files found")

for i, file_path in enumerate(files, 1):

    try:
        html = file_path.read_text(encoding="utf-8", errors="ignore")
        soup = BeautifulSoup(html, "html.parser")

        title = extract_title(soup)

        if not title:
            unresolved.append({
                "file": file_path.name,
                "reason": "no_title_found"
            })
            continue

        key = slugify(title)

        # register canonical page
        title_registry[key] = {
            "title": title,
            "file": str(file_path)
        }

        # detect redirect
        redirect_href = detect_redirect(soup)
        if redirect_href:
            target = Path(redirect_href).stem
            target_key = slugify(target)

            redirects[key] = target_key
            alias_registry[key] = target_key

    except Exception as e:
        unresolved.append({
            "file": file_path.name,
            "reason": str(e)
        })

    if i % 100 == 0:
        print(f"{i}/{len(files)} processed")

# ======================
# AUTO ALIAS GENERATION
# ======================

auto_alias_count = 0

for key in list(title_registry.keys()):
    stripped = slugify(strip_prefix(key))

    if stripped != key and stripped in title_registry:
        alias_registry[key] = stripped
        auto_alias_count += 1

print(f"Automatic aliases added: {auto_alias_count}")

# ======================
# SAVE FILES
# ======================

with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
    json.dump(title_registry, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
    json.dump(alias_registry, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
    json.dump(redirects, f, indent=2, ensure_ascii=False)

with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
    json.dump(unresolved, f, indent=2, ensure_ascii=False)

print("\n✅ REGISTRY BUILD COMPLETE")
print(f"Unique pages: {len(title_registry)}")
print(f"Aliases: {len(alias_registry)}")
print(f"Redirects: {len(redirects)}")
print(f"Problems: {len(unresolved)}")
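For orientation, the registries this script writes are flat JSON maps. A minimal sketch of their shape, with keys and file names invented purely for illustration (not taken from real output):

# Hypothetical registry contents (keys and paths are made up for illustration).
title_registry = {
    "main_page": {"title": "Main Page", "file": "../unique_pages/Main_Page.html"},
    "category_widgets": {"title": "Category:Widgets", "file": "../unique_pages/Category_Widgets.html"},
    "widgets": {"title": "Widgets", "file": "../unique_pages/Widgets.html"},
}

# alias_registry maps an alternate key to the canonical key it resolves to.
alias_registry = {
    "old_widgets": "widgets",       # from a detected #REDIRECT page
    "category_widgets": "widgets",  # auto alias after prefix stripping
}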
@@ -3,8 +3,8 @@ import re
 from pathlib import Path
 from collections import defaultdict, Counter

-INPUT_DIR = Path(".")  # directory containing the files
-OUTPUT_FILE = "category_analysis.json"
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_FILE = "../category_analysis.json"


 # ---------------------------
@@ -7,8 +7,8 @@ from collections import defaultdict
 # CONFIG
 # =========================

-INPUT_DIR = "."  # directory containing the 700 files
-OUTPUT_FILE = "categories.json"
+INPUT_DIR = "../original_index"  # directory containing the 700 files
+OUTPUT_FILE = "../categories.json"

 # extensions to analyze
 VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}
@@ -4,8 +4,8 @@ import re
 import json


-INPUT_DIR = Path(".")
-OUTPUT_DIR = Path("unique_pages")
+INPUT_DIR = Path("../original_index")
+OUTPUT_DIR = Path("../unique_pages")
 OUTPUT_DIR.mkdir(exist_ok=True)

 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
scan_internal_links.py (Normal file, +122 lines)
@@ -0,0 +1,122 @@
from pathlib import Path
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse

INPUT_DIR = Path("../unique_pages")
REGISTRY_DIR = Path("../link_registry")

title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))

OUTPUT_RESOLVED = []
OUTPUT_UNRESOLVED = []

# ======================
# HELPERS
# ======================

def normalize_href(href: str):
    if not href:
        return None

    # ignore external links
    if href.startswith("http"):
        return None

    name = Path(href).stem
    return name.lower()


def resolve(name):
    if name in title_registry:
        return name

    if name in alias_registry:
        return alias_registry[name]

    # try removing category prefix
    if name.startswith("category_"):
        alt = name.replace("category_", "", 1)
        if alt in title_registry:
            return alt

    return None


def extract_article_links(soup):

    content = soup.find("div", id="mw-content-text")
    if not content:
        return []

    links = []

    for a in content.find_all("a", href=True):

        href = a["href"]

        # ignore anchors
        if href.startswith("#"):
            continue

        # ignore files/images/history/etc
        if any(prefix in href.lower() for prefix in [
            "file_",
            "image:",
            "special:",
            "action=",
        ]):
            continue

        links.append(href)

    return links


# ======================
# MAIN
# ======================

files = list(INPUT_DIR.glob("*.html"))
print(f"{len(files)} pages to analyze")

for i, file_path in enumerate(files, 1):

    html = file_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    links = extract_article_links(soup)

    for href in links:

        key = normalize_href(href)
        if not key:
            continue

        resolved = resolve(key)

        entry = {
            "source": file_path.name,
            "link": href,
        }

        if resolved:
            entry["target"] = resolved
            OUTPUT_RESOLVED.append(entry)
        else:
            OUTPUT_UNRESOLVED.append(entry)

    if i % 100 == 0:
        print(f"{i}/{len(files)} analyzed")

# ======================
# SAVE
# ======================

json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8"), indent=2)
json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8"), indent=2)

print("\n✅ LINK SCAN COMPLETE")
print("Resolved:", len(OUTPUT_RESOLVED))
print("Unresolved:", len(OUTPUT_UNRESOLVED))
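As a rough usage note (not part of the commit): the two new scripts are meant to run in sequence, build_link_registry.py first, then scan_internal_links.py. A minimal sketch of how the scan output could then be summarized, assuming the files were written to ../link_registry as configured above:

# Sketch only: aggregate the results written by scan_internal_links.py.
import json
from pathlib import Path
from collections import Counter

REGISTRY_DIR = Path("../link_registry")

resolved = json.load(open(REGISTRY_DIR / "resolved_links.json", encoding="utf-8"))
unresolved = json.load(open(REGISTRY_DIR / "unresolved_links.json", encoding="utf-8"))

# pages with the most resolved outgoing links
outgoing = Counter(entry["source"] for entry in resolved)
print(outgoing.most_common(5))

# most frequent unresolved hrefs, candidates for new aliases or PREFIXES entries
missing = Counter(entry["link"] for entry in unresolved)
print(missing.most_common(10))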
@@ -3,8 +3,8 @@ import shutil
 import re
 import json

-INPUT_DIR = Path("unique_pages")
-OUTPUT_DIR = Path("classified_pages")
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../classified_pages")

 CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')

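Taken together, the path tweaks point every script's inputs and outputs one directory up, so the scripts appear to live in a subfolder beside the data. A quick sanity check of that layout is sketched below; only the directory names that appear in the diff are taken from it, the descriptions are assumptions:

# Sketch: check the sibling-directory layout implied by the "../" paths.
from pathlib import Path

expected = [
    "../original_index",   # input HTML read by the scanning scripts
    "../unique_pages",     # pages extracted by the dedup step, read by the link scripts
    "../classified_pages", # output directory of the classification script
    "../link_registry",    # output directory of build_link_registry.py
]

for p in expected:
    print(p, "ok" if Path(p).exists() else "missing")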