From e4aaa33137f4f6df3a4aa7a76656c7f32d7381fa Mon Sep 17 00:00:00 2001
From: maximator
Date: Fri, 3 Apr 2026 15:50:40 +0200
Subject: [PATCH] tweak paths and find links

---
 analyze_categories.py      |   4 +-
 build_link_registry.py     | 177 +++++++++++++++++++++++++++++++++++++
 category_graph_analysis.py |   4 +-
 extract_categories.py      |   4 +-
 remove_duplicate_pages.py  |   4 +-
 scan_internal_links.py     | 122 +++++++++++++++++++++++++
 sort_pages.py              |   4 +-
 7 files changed, 309 insertions(+), 10 deletions(-)
 create mode 100644 build_link_registry.py
 create mode 100644 scan_internal_links.py
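
Review note, not part of the commit: below is a minimal sketch of how the registries
written by build_link_registry.py might be consumed downstream, assuming the default
../link_registry output directory used in the patch. resolve_title() is a hypothetical
helper that mirrors the resolve() logic in scan_internal_links.py; it is not added by
this patch, and the example slug is illustrative only.

# hypothetical consumer sketch -- not part of this patch
import json
from pathlib import Path

REGISTRY_DIR = Path("../link_registry")  # assumed: default output dir of build_link_registry.py

# load the two JSON registries produced by build_link_registry.py
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

def resolve_title(key: str):
    """Return the canonical page key for a slug, or None (mirrors resolve() in scan_internal_links.py)."""
    if key in title_registry:
        return key
    if key in alias_registry:
        return alias_registry[key]
    return None

# example: a slug resolves either directly or through an alias/redirect entry
print(resolve_title("category_main_page"))
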
diff --git a/analyze_categories.py b/analyze_categories.py
index 70530e8..10c5b3c 100644
--- a/analyze_categories.py
+++ b/analyze_categories.py
@@ -1,8 +1,8 @@
 import json
 from collections import defaultdict
 
-INPUT_FILE = "categories.json"
-OUTPUT_FILE = "categories_analysis.json"
+INPUT_FILE = "../categories.json"
+OUTPUT_FILE = "../categories_analysis.json"
 
 # patterns typiques MediaWiki bruit
 IGNORE_PATTERNS = [
diff --git a/build_link_registry.py b/build_link_registry.py
new file mode 100644
index 0000000..f65e7e4
--- /dev/null
+++ b/build_link_registry.py
@@ -0,0 +1,177 @@
+from pathlib import Path
+import json
+import re
+from bs4 import BeautifulSoup
+
+# ======================
+# CONFIG
+# ======================
+
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../link_registry")
+
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+# MediaWiki-like prefixes
+PREFIXES = [
+    "category:",
+    "category_",
+    "file:",
+    "template:",
+]
+
+# ======================
+# HELPERS
+# ======================
+
+def normalize(title: str) -> str:
+    """Canonical key normalization."""
+    if not title:
+        return ""
+
+    title = title.strip()
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title)
+
+    return title.lower()
+
+
+def slugify(title: str) -> str:
+    return normalize(title).replace(" ", "_")
+
+
+def strip_prefix(title: str) -> str:
+    t = title.lower()
+    for p in PREFIXES:
+        if t.startswith(p):
+            return title[len(p):]
+    return title
+
+
+def extract_title(soup: BeautifulSoup):
+    """Try multiple strategies to extract page title."""
+
+    # Strategy 1: MediaWiki heading
+    h1 = soup.find("h1", id="firstHeading")
+    if h1:
+        return h1.get_text(strip=True)
+
+    # Strategy 2: HTML title
+    if soup.title:
+        title = soup.title.get_text()
+        if "-" in title:
+            return title.split("-")[0].strip()
+        return title.strip()
+
+    return None
+
+
+def detect_redirect(soup: BeautifulSoup):
+    """Detect MediaWiki redirect pages."""
+    text = soup.get_text(" ", strip=True).lower()
+
+    # HTTrack redirects often contain this
+    if "#redirect" in text:
+        link = soup.find("a")
+        if link and link.get("href"):
+            return link["href"]
+
+    # alternative pattern
+    redirect_note = soup.find(class_="redirectText")
+    if redirect_note:
+        link = redirect_note.find("a")
+        if link:
+            return link.get("href")
+
+    return None
+
+
+# ======================
+# MAIN
+# ======================
+
+title_registry = {}
+alias_registry = {}
+redirects = {}
+unresolved = []
+
+files = list(INPUT_DIR.glob("*.html"))
+print(f"{len(files)} files found")
+
+for i, file_path in enumerate(files, 1):
+
+    try:
+        html = file_path.read_text(encoding="utf-8", errors="ignore")
+        soup = BeautifulSoup(html, "html.parser")
+
+        title = extract_title(soup)
+
+        if not title:
+            unresolved.append({
+                "file": file_path.name,
+                "reason": "no_title_found"
+            })
+            continue
+
+        key = slugify(title)
+
+        # register canonical page
+        title_registry[key] = {
+            "title": title,
+            "file": str(file_path)
+        }
+
+        # detect redirect
+        redirect_href = detect_redirect(soup)
+        if redirect_href:
+            target = Path(redirect_href).stem
+            target_key = slugify(target)
+
+            redirects[key] = target_key
+            alias_registry[key] = target_key
+
+    except Exception as e:
+        unresolved.append({
+            "file": file_path.name,
+            "reason": str(e)
+        })
+
+    if i % 100 == 0:
+        print(f"{i}/{len(files)} processed")
+
+# ======================
+# AUTO ALIAS GENERATION
+# ======================
+
+auto_alias_count = 0
+
+for key in list(title_registry.keys()):
+    stripped = slugify(strip_prefix(key))
+
+    if stripped != key and stripped in title_registry:
+        alias_registry[key] = stripped
+        auto_alias_count += 1
+
+print(f"Automatic aliases added: {auto_alias_count}")
+
+# ======================
+# SAVE FILES
+# ======================
+
+with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
+    json.dump(title_registry, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
+    json.dump(alias_registry, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
+    json.dump(redirects, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
+    json.dump(unresolved, f, indent=2, ensure_ascii=False)
+
+print("\n✅ REGISTRY BUILD COMPLETE")
+print(f"Unique pages: {len(title_registry)}")
+print(f"Aliases: {len(alias_registry)}")
+print(f"Redirects: {len(redirects)}")
+print(f"Issues: {len(unresolved)}")
\ No newline at end of file
diff --git a/category_graph_analysis.py b/category_graph_analysis.py
index 35f97ba..2bc5034 100644
--- a/category_graph_analysis.py
+++ b/category_graph_analysis.py
@@ -3,8 +3,8 @@ import re
 from pathlib import Path
 from collections import defaultdict, Counter
 
-INPUT_DIR = Path(".")          # dossier contenant les fichiers
-OUTPUT_FILE = "category_analysis.json"
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_FILE = "../category_analysis.json"
 
 
 # ---------------------------
diff --git a/extract_categories.py b/extract_categories.py
index bd1ef55..814dca9 100644
--- a/extract_categories.py
+++ b/extract_categories.py
@@ -7,8 +7,8 @@ from collections import defaultdict
 # =========================
 # CONFIG
 # =========================
-INPUT_DIR = "."                    # dossier contenant les 700 fichiers
-OUTPUT_FILE = "categories.json"
+INPUT_DIR = "../original_index"    # dossier contenant les 700 fichiers
+OUTPUT_FILE = "../categories.json"
 
 # extensions à analyser
 VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}
diff --git a/remove_duplicate_pages.py b/remove_duplicate_pages.py
index 20e5d8c..06f9413 100644
--- a/remove_duplicate_pages.py
+++ b/remove_duplicate_pages.py
@@ -4,8 +4,8 @@ import re
 import json
 
-INPUT_DIR = Path(".")
-OUTPUT_DIR = Path("unique_pages")
+INPUT_DIR = Path("../original_index")
+OUTPUT_DIR = Path("../unique_pages")
 
 OUTPUT_DIR.mkdir(exist_ok=True)
 
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
diff --git a/scan_internal_links.py b/scan_internal_links.py
new file mode 100644
index 0000000..e2a64c9
--- /dev/null
+++ b/scan_internal_links.py
@@ -0,0 +1,122 @@
+from pathlib import Path
+import json
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+
+INPUT_DIR = Path("../unique_pages")
+REGISTRY_DIR = Path("../link_registry")
+
+title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
+alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
+
+OUTPUT_RESOLVED = []
+OUTPUT_UNRESOLVED = []
+
+# ======================
+# HELPERS
+# ======================
+
+def normalize_href(href: str):
+    if not href:
+        return None
+
+    # ignore external links
+    if href.startswith("http"):
+        return None
+
+    name = Path(href).stem
+    return name.lower()
+
+
+def resolve(name):
+    if name in title_registry:
+        return name
+
+    if name in alias_registry:
+        return alias_registry[name]
+
+    # try removing category prefix
+    if name.startswith("category_"):
+        alt = name.replace("category_", "", 1)
+        if alt in title_registry:
+            return alt
+
+    return None
+
+
+def extract_article_links(soup):
+
+    content = soup.find("div", id="mw-content-text")
+    if not content:
+        return []
+
+    links = []
+
+    for a in content.find_all("a", href=True):
+
+        href = a["href"]
+
+        # ignore anchors
+        if href.startswith("#"):
+            continue
+
+        # ignore files/images/history/etc
+        if any(prefix in href.lower() for prefix in [
+            "file_",
+            "image:",
+            "special:",
+            "action=",
+        ]):
+            continue
+
+        links.append(href)
+
+    return links
+
+
+# ======================
+# MAIN
+# ======================
+
+files = list(INPUT_DIR.glob("*.html"))
+print(f"{len(files)} pages to analyze")
+
+for i, file_path in enumerate(files, 1):
+
+    html = file_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html, "html.parser")
+
+    links = extract_article_links(soup)
+
+    for href in links:
+
+        key = normalize_href(href)
+        if not key:
+            continue
+
+        resolved = resolve(key)
+
+        entry = {
+            "source": file_path.name,
+            "link": href,
+        }
+
+        if resolved:
+            entry["target"] = resolved
+            OUTPUT_RESOLVED.append(entry)
+        else:
+            OUTPUT_UNRESOLVED.append(entry)
+
+    if i % 100 == 0:
+        print(f"{i}/{len(files)} analyzed")
+
+# ======================
+# SAVE
+# ======================
+
+json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8"), indent=2)
+json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8"), indent=2)
+
+print("\n✅ LINK SCAN COMPLETE")
+print("Resolved:", len(OUTPUT_RESOLVED))
+print("Unresolved:", len(OUTPUT_UNRESOLVED))
\ No newline at end of file
diff --git a/sort_pages.py b/sort_pages.py
index 59eeaef..bd1b1de 100644
--- a/sort_pages.py
+++ b/sort_pages.py
@@ -3,8 +3,8 @@ import shutil
 import re
 import json
 
-INPUT_DIR = Path("unique_pages")
-OUTPUT_DIR = Path("classified_pages")
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../classified_pages")
 
 
 CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')