diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index 8a619ba..1b3ea96 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -5,6 +5,7 @@ import shutil
 import html
 from pathlib import Path
 from collections import defaultdict
+from difflib import SequenceMatcher
 
 SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
@@ -24,6 +25,11 @@ INVALID_WIN_CHARS = r'[<>:"/\\|?*]'
 ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')
 IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
 NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
+WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
+SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
+
+def similarity(a, b):
+    return SequenceMatcher(None, a, b).ratio()
 
 def normalize_title(title: str) -> str:
     title = title.strip()
@@ -77,6 +83,25 @@ def extract_namespace(html: str) -> str:
     return ""
 
 
+def extract_wg_title(page_html):
+    m = WG_TITLE_RE.search(page_html)
+    if m:
+        return html.unescape(m.group(1))
+    return None
+
+
+def normalize_reference_key(key: str) -> str:
+    key = normalize_title(key)
+
+    # strip any leading "category" namespace prefix
+    key = re.sub(r"^category[\s:_]+", "", key)
+
+    # collapse whitespace
+    key = re.sub(r"\s+", " ", key)
+
+    return key.strip()
+
+
 # --------------------------------------------------
 # Registry structures
 # --------------------------------------------------
@@ -121,6 +146,7 @@ for i, path in enumerate(files, 1):
     base_title = norm
     is_redirect = bool(IS_REDIRECT_RE.search(page_html))
     is_category = ns == "Category" or norm.startswith("category:")
+    wg_title = extract_wg_title(page_html)
 
     # Categories
     if ns == "Category":
@@ -148,11 +174,13 @@ for i, path in enumerate(files, 1):
         if redir:
             redirects[full_title] = normalize_title(redir)
 
+    canonical_key = normalize_reference_key(full_title)
     all_variants[article_id].append({
         "path": path,
         "title": base_title,
-        "full_title": full_title,
+        "canonical_key": canonical_key,
         "article_id": article_id,
+        "wg_title": normalize_title(wg_title) if wg_title else None,
         "redirect": is_redirect,
         "is_category": is_category,
     })
@@ -162,6 +190,7 @@ for i, path in enumerate(files, 1):
     if i % 200 == 0:
         print(f"{i}/{len(files)} analyzed")
 
+print("Variants collected:", len(all_variants))
 # --------------------------------------------------
 # PASS 2 — choose the canonical versions
 # --------------------------------------------------
@@ -173,15 +202,52 @@ category_replaced = 0
 nb_all_cat = 0
 
 
+def slug_to_title(filename: str) -> str:
+    name = Path(filename).stem
+    name = re.sub(r"\d+$", "", name)
+    return normalize_title(name)
+
+
+def filename_similarity_score(filename, wg_title):
+    if not wg_title:
+        return 0
+
+    filename = normalize_title(filename)
+    wg_title = normalize_title(wg_title)
+
+    # strip trailing digits
+    filename = re.sub(r"\d+$", "", filename)
+
+    return similarity(filename, wg_title)
+
 
 def variant_score(v):
-    """
-    The lower the score, the better the candidate.
-    """
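+    """Sort key for picking the canonical variant: the smallest tuple wins."""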
+
+    filename = v["path"].stem
+    filename_norm = normalize_title(filename)
+
+    similarity_score = filename_similarity_score(
+        filename_norm,
+        v["wg_title"]
+    )
+
+    is_short_slug = bool(
+        SHORT_SLUG_RE.match(filename_norm.replace(" ", ""))
+    )
+
+    long_title_penalty = (
+        "," in filename or
+        "_" in filename or
+        len(filename) > 40
+    )
+
     return (
-        v["is_category"],                        # False (0) beats True (1)
-        v["redirect"],                           # False is better
-        "category:" in v["path"].name.lower(),   # filename safety net
-        len(v["path"].name),                     # stability
+        v["is_category"],        # real articles before categories
+        v["redirect"],           # real pages before redirects
+        not is_short_slug,       # prefer short slug-like filenames
+        long_title_penalty,      # penalise long or decorated filenames
+        -similarity_score,       # closer to wgTitle is better
+        len(filename),           # shorter filename wins ties
+        filename.lower(),        # final deterministic tie-break
     )
 
 
@@ -189,6 +255,7 @@ for article_id, variants in all_variants.items():
 
     # deterministic sort
     variants_sorted = sorted(variants, key=variant_score)
+    print(f"variants_sorted: {variants_sorted}")
 
     chosen = variants_sorted[0]
 
@@ -198,16 +265,34 @@ for article_id, variants in all_variants.items():
     if chosen["is_category"]:
         category_replaced += 1
 
+    canonical_title = normalize_reference_key(chosen["title"])
+
     canonical_pages[article_id] = {
         "path": chosen["path"],
-        "title": chosen["title"],
+        "title": canonical_title,
         "redirect": chosen["redirect"],
     }
 
     # equivalences
     for v in variants:
-        equivalences[v["full_title"]] = chosen["title"]
+        equivalences[v["canonical_key"]] = chosen["title"]
+
+equivalences.clear()
+
+def add_equivalence(k, v):
+    k = normalize_reference_key(k)
+    v = normalize_reference_key(v)
+
+    if k != v:
+        equivalences[k] = v
+
+for article_id, variants in all_variants.items():
+    canonical_title = canonical_pages[article_id]["title"]
+    canonical_slug = Path(canonical_pages[article_id]["path"]).stem
+    for v in variants:
+        add_equivalence(v["canonical_key"], canonical_slug)
+        filename_key = normalize_title(Path(v["path"]).stem)
+        add_equivalence(filename_key, canonical_slug)
 
 print(f"Number of cases where every variant is a category: {nb_all_cat}")
 print(f"{category_replaced} 'category_*' pages replaced by their base version")
@@ -231,7 +316,7 @@ for src, dst in list(redirects.items()):
     equivalences[src] = final
 
 redirects.clear()
-  # --------------------------------------------------
+# --------------------------------------------------
 # PASS 4 — final normalization of the equivalences
 # --------------------------------------------------
 
@@ -243,21 +328,41 @@ valid_titles = {
 
 for k, v in list(equivalences.items()):
     if v not in valid_titles:
         equivalences[k] = equivalences.get(v, v)
-
+# category:* or category_* used as keys
+for k, v in list(equivalences.items()):
+    new_k = re.sub(r"^category[\s:_]+", "category ", k)
+    if new_k != k:
+        equivalences[new_k] = v
+        del equivalences[k]
 
 # invariant registry
 for k, v in equivalences.items():
     if v not in valid_titles:
         problems.append(f"Non canonical mapping: {k} -> {v}")
-
+equivalences = {
+    k: v for k, v in equivalences.items()
+    if k != v
+}
 
 # --------------------------------------------------
 # PASS 5 — copy the canonical pages
 # --------------------------------------------------
 
+
+def title_to_filename(title: str) -> str:
+    return sanitize_filename(
+        title.replace(" ", "_").casefold() + ".html"
+    )
+
+
 copied = 0
 
-for key, data in canonical_pages.items():
+total = len(canonical_pages)
+
+for i, (key, data) in enumerate(canonical_pages.items(), 1):
+
     src = data["path"]
-    dst_name = sanitize_filename(src.name)
+
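+    # casefold the destination name so sources differing only by case map to one file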
+    dst_name = sanitize_filename(src.name.casefold())
     dst = PAGES_DIR / dst_name
+
     try:
         shutil.copy2(src, dst)
         canonical_pages[key] = dst_name
@@ -265,6 +370,9 @@ for key, data in canonical_pages.items():
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
 
+    if i % 200 == 0 or i == total:
+        print(f"{i}/{total} copied")
+
 print(f"{copied} pages copied")
 
 # --------------------------------------------------
diff --git a/scan_internal_links.py b/scan_internal_links.py
index e2a64c9..0858a57 100644
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote
 
-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------
 
-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")
 
-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
+OUTPUT_DIR.mkdir(exist_ok=True)
 
-# ======================
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
 # HELPERS
-# ======================
+# --------------------------------------------------
+
+def normalize_title(title: str | None):
+    if not title:
+        return None
+
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
+
+def extract_mediawiki_target(href: str):
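+    """Return the wiki page name referenced by href, or None for anchors and external links."""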
+
-def normalize_href(href: str):
     if not href:
         return None
 
-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None
 
-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback: treat the path like a filename
+    return Path(path).stem
 
 
-def resolve(name):
-    if name in title_registry:
-        return name
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------
 
-    if name in alias_registry:
-        return alias_registry[name]
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)
 
-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)
 
-    return None
 
+# -------------------------
+# Extract article content
+# -------------------------
 
 def extract_article_links(soup):
@@ -52,33 +104,26 @@ def extract_article_links(soup):
 
     links = []
 
-    for a in content.find_all("a", href=True):
+    for a in content.select("a[href]"):
 
-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue
 
+        href = a.get("href")
         links.append(href)
 
     return links
 
 
-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------
 
-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages to scan")
 
 for i, file_path in enumerate(files, 1):
@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):
 
     for href in links:
 
-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+
+        if not norm:
             continue
 
-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue
 
         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }
 
+        resolved = equivalences.get(norm)
+
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)
 
     if i % 100 == 0:
         print(f"{i}/{len(files)} scanned")
 
-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------
 
-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
 
 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
\ No newline at end of file
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))
\ No newline at end of file