From e4aaa33137f4f6df3a4aa7a76656c7f32d7381fa Mon Sep 17 00:00:00 2001
From: maximator
Date: Fri, 3 Apr 2026 15:50:40 +0200
Subject: [PATCH] tweak paths and find links

---
 analyze_categories.py      |   4 +-
 build_link_registry.py     | 177 +++++++++++++++++++++++++++++++++++++
 category_graph_analysis.py |   4 +-
 extract_categories.py      |   4 +-
 remove_duplicate_pages.py  |   4 +-
 scan_internal_links.py     | 122 +++++++++++++++++++++++++
 sort_pages.py              |   4 +-
 7 files changed, 309 insertions(+), 10 deletions(-)
 create mode 100644 build_link_registry.py
 create mode 100644 scan_internal_links.py
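
Review note, not part of the commit: below is a minimal sketch of how the registries
written by build_link_registry.py might be consumed downstream, assuming the default
../link_registry output directory used in the patch. resolve_title() is a hypothetical
helper that mirrors the resolve() logic in scan_internal_links.py; it is not added by
this patch, and the example slug is illustrative only.

# hypothetical consumer sketch -- not part of this patch
import json
from pathlib import Path

REGISTRY_DIR = Path("../link_registry")  # assumed: default output dir of build_link_registry.py

# load the two JSON registries produced by build_link_registry.py
with open(REGISTRY_DIR / "title_registry.json", encoding="utf-8") as f:
    title_registry = json.load(f)
with open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8") as f:
    alias_registry = json.load(f)

def resolve_title(key: str):
    """Return the canonical page key for a slug, or None (mirrors resolve() in scan_internal_links.py)."""
    if key in title_registry:
        return key
    if key in alias_registry:
        return alias_registry[key]
    return None

# example: a slug resolves either directly or through an alias/redirect entry
print(resolve_title("category_main_page"))
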
diff --git a/analyze_categories.py b/analyze_categories.py
index 70530e8..10c5b3c 100644
--- a/analyze_categories.py
+++ b/analyze_categories.py
@@ -1,8 +1,8 @@
 import json
 from collections import defaultdict
 
-INPUT_FILE = "categories.json"
-OUTPUT_FILE = "categories_analysis.json"
+INPUT_FILE = "../categories.json"
+OUTPUT_FILE = "../categories_analysis.json"
 
 # patterns typiques MediaWiki bruit
 IGNORE_PATTERNS = [
diff --git a/build_link_registry.py b/build_link_registry.py
new file mode 100644
index 0000000..f65e7e4
--- /dev/null
+++ b/build_link_registry.py
@@ -0,0 +1,177 @@
+from pathlib import Path
+import json
+import re
+from bs4 import BeautifulSoup
+
+# ======================
+# CONFIG
+# ======================
+
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../link_registry")
+
+OUTPUT_DIR.mkdir(exist_ok=True)
+
+# MediaWiki-like prefixes
+PREFIXES = [
+    "category:",
+    "category_",
+    "file:",
+    "template:",
+]
+
+# ======================
+# HELPERS
+# ======================
+
+def normalize(title: str) -> str:
+    """Canonical key normalization."""
+    if not title:
+        return ""
+
+    title = title.strip()
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title)
+
+    return title.lower()
+
+
+def slugify(title: str) -> str:
+    return normalize(title).replace(" ", "_")
+
+
+def strip_prefix(title: str) -> str:
+    t = title.lower()
+    for p in PREFIXES:
+        if t.startswith(p):
+            return title[len(p):]
+    return title
+
+
+def extract_title(soup: BeautifulSoup):
+    """Try multiple strategies to extract page title."""
+
+    # Strategy 1: MediaWiki heading
+    h1 = soup.find("h1", id="firstHeading")
+    if h1:
+        return h1.get_text(strip=True)
+
+    # Strategy 2: HTML title
+    if soup.title:
+        title = soup.title.get_text()
+        if "-" in title:
+            return title.split("-")[0].strip()
+        return title.strip()
+
+    return None
+
+
+def detect_redirect(soup: BeautifulSoup):
+    """Detect MediaWiki redirect pages."""
+    text = soup.get_text(" ", strip=True).lower()
+
+    # HTTrack redirects often contain this
+    if "#redirect" in text:
+        link = soup.find("a")
+        if link and link.get("href"):
+            return link["href"]
+
+    # alternative pattern
+    redirect_note = soup.find(class_="redirectText")
+    if redirect_note:
+        link = redirect_note.find("a")
+        if link:
+            return link.get("href")
+
+    return None
+
+
+# ======================
+# MAIN
+# ======================
+
+title_registry = {}
+alias_registry = {}
+redirects = {}
+unresolved = []
+
+files = list(INPUT_DIR.glob("*.html"))
+print(f"{len(files)} files found")
+
+for i, file_path in enumerate(files, 1):
+
+    try:
+        html = file_path.read_text(encoding="utf-8", errors="ignore")
+        soup = BeautifulSoup(html, "html.parser")
+
+        title = extract_title(soup)
+
+        if not title:
+            unresolved.append({
+                "file": file_path.name,
+                "reason": "no_title_found"
+            })
+            continue
+
+        key = slugify(title)
+
+        # register canonical page
+        title_registry[key] = {
+            "title": title,
+            "file": str(file_path)
+        }
+
+        # detect redirect
+        redirect_href = detect_redirect(soup)
+        if redirect_href:
+            target = Path(redirect_href).stem
+            target_key = slugify(target)
+
+            redirects[key] = target_key
+            alias_registry[key] = target_key
+
+    except Exception as e:
+        unresolved.append({
+            "file": file_path.name,
+            "reason": str(e)
+        })
+
+    if i % 100 == 0:
+        print(f"{i}/{len(files)} processed")
+
+# ======================
+# AUTO ALIAS GENERATION
+# ======================
+
+auto_alias_count = 0
+
+for key in list(title_registry.keys()):
+    stripped = slugify(strip_prefix(key))
+
+    if stripped != key and stripped in title_registry:
+        alias_registry[key] = stripped
+        auto_alias_count += 1
+
+print(f"Automatic aliases added: {auto_alias_count}")
+
+# ======================
+# SAVE FILES
+# ======================
+
+with open(OUTPUT_DIR / "title_registry.json", "w", encoding="utf-8") as f:
+    json.dump(title_registry, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "alias_registry.json", "w", encoding="utf-8") as f:
+    json.dump(alias_registry, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "redirects_detected.json", "w", encoding="utf-8") as f:
+    json.dump(redirects, f, indent=2, ensure_ascii=False)
+
+with open(OUTPUT_DIR / "unresolved_pages.json", "w", encoding="utf-8") as f:
+    json.dump(unresolved, f, indent=2, ensure_ascii=False)
+
+print("\n✅ REGISTRY BUILD COMPLETE")
+print(f"Unique pages: {len(title_registry)}")
+print(f"Aliases: {len(alias_registry)}")
+print(f"Redirects: {len(redirects)}")
+print(f"Issues: {len(unresolved)}")
\ No newline at end of file
diff --git a/category_graph_analysis.py b/category_graph_analysis.py
index 35f97ba..2bc5034 100644
--- a/category_graph_analysis.py
+++ b/category_graph_analysis.py
@@ -3,8 +3,8 @@ import re
 from pathlib import Path
 from collections import defaultdict, Counter
 
-INPUT_DIR = Path(".")          # dossier contenant les fichiers
-OUTPUT_FILE = "category_analysis.json"
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_FILE = "../category_analysis.json"
 
 
 # ---------------------------
diff --git a/extract_categories.py b/extract_categories.py
index bd1ef55..814dca9 100644
--- a/extract_categories.py
+++ b/extract_categories.py
@@ -7,8 +7,8 @@ from collections import defaultdict
 # =========================
 # CONFIG
 # =========================
-INPUT_DIR = "."                    # dossier contenant les 700 fichiers
-OUTPUT_FILE = "categories.json"
+INPUT_DIR = "../original_index"    # dossier contenant les 700 fichiers
+OUTPUT_FILE = "../categories.json"
 
 # extensions à analyser
 VALID_EXTENSIONS = {".html", ".htm", ".txt", ".js"}
diff --git a/remove_duplicate_pages.py b/remove_duplicate_pages.py
index 20e5d8c..06f9413 100644
--- a/remove_duplicate_pages.py
+++ b/remove_duplicate_pages.py
@@ -4,8 +4,8 @@ import re
 import json
 
-INPUT_DIR = Path(".")
-OUTPUT_DIR = Path("unique_pages")
+INPUT_DIR = Path("../original_index")
+OUTPUT_DIR = Path("../unique_pages")
 
 OUTPUT_DIR.mkdir(exist_ok=True)
 
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
diff --git a/scan_internal_links.py b/scan_internal_links.py
new file mode 100644
index 0000000..e2a64c9
--- /dev/null
+++ b/scan_internal_links.py
@@ -0,0 +1,122 @@
+from pathlib import Path
+import json
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+
+INPUT_DIR = Path("../unique_pages")
+REGISTRY_DIR = Path("../link_registry")
+
+title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
+alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
+
+OUTPUT_RESOLVED = []
+OUTPUT_UNRESOLVED = []
+
+# ======================
+# HELPERS
+# ======================
+
+def normalize_href(href: str):
+    if not href:
+        return None
+
+    # ignore external links
+    if href.startswith("http"):
+        return None
+
+    name = Path(href).stem
+    return name.lower()
+
+
+def resolve(name):
+    if name in title_registry:
+        return name
+
+    if name in alias_registry:
+        return alias_registry[name]
+
+    # try removing category prefix
+    if name.startswith("category_"):
+        alt = name.replace("category_", "", 1)
+        if alt in title_registry:
+            return alt
+
+    return None
+
+
+def extract_article_links(soup):
+
+    content = soup.find("div", id="mw-content-text")
+    if not content:
+        return []
+
+    links = []
+
+    for a in content.find_all("a", href=True):
+
+        href = a["href"]
+
+        # ignore anchors
+        if href.startswith("#"):
+            continue
+
+        # ignore files/images/history/etc
+        if any(prefix in href.lower() for prefix in [
+            "file_",
+            "image:",
+            "special:",
+            "action=",
+        ]):
+            continue
+
+        links.append(href)
+
+    return links
+
+
+# ======================
+# MAIN
+# ======================
+
+files = list(INPUT_DIR.glob("*.html"))
+print(f"{len(files)} pages to analyze")
+
+for i, file_path in enumerate(files, 1):
+
+    html = file_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(html, "html.parser")
+
+    links = extract_article_links(soup)
+
+    for href in links:
+
+        key = normalize_href(href)
+        if not key:
+            continue
+
+        resolved = resolve(key)
+
+        entry = {
+            "source": file_path.name,
+            "link": href,
+        }
+
+        if resolved:
+            entry["target"] = resolved
+            OUTPUT_RESOLVED.append(entry)
+        else:
+            OUTPUT_UNRESOLVED.append(entry)
+
+    if i % 100 == 0:
+        print(f"{i}/{len(files)} analyzed")
+
+# ======================
+# SAVE
+# ======================
+
+json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json", "w", encoding="utf-8"), indent=2)
+json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json", "w", encoding="utf-8"), indent=2)
+
+print("\n✅ LINK SCAN COMPLETE")
+print("Resolved:", len(OUTPUT_RESOLVED))
+print("Unresolved:", len(OUTPUT_UNRESOLVED))
\ No newline at end of file
diff --git a/sort_pages.py b/sort_pages.py
index 59eeaef..bd1b1de 100644
--- a/sort_pages.py
+++ b/sort_pages.py
@@ -3,8 +3,8 @@ import shutil
 import re
 import json
 
-INPUT_DIR = Path("unique_pages")
-OUTPUT_DIR = Path("classified_pages")
+INPUT_DIR = Path("../unique_pages")
+OUTPUT_DIR = Path("../classified_pages")
 
 
 CATEGORY_RE = re.compile(r'"wgCategories":\[(.*?)\]')