fix mapping canonical preference
parent 90dd3cc152
commit 7f019ed98c
2 changed files with 242 additions and 72 deletions
@@ -1,48 +1,100 @@
 from pathlib import Path
 import json
+import re
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
+from urllib.parse import urlparse, parse_qs, unquote

-INPUT_DIR = Path("../unique_pages")
-REGISTRY_DIR = Path("../link_registry")
+# --------------------------------------------------
+# PATHS
+# --------------------------------------------------

-title_registry = json.load(open(REGISTRY_DIR / "title_registry.json", encoding="utf-8"))
-alias_registry = json.load(open(REGISTRY_DIR / "alias_registry.json", encoding="utf-8"))
+PAGES_DIR = Path("../output/pages")
+REGISTRY_PATH = Path("../output/equivalence_registry.json")
+OUTPUT_DIR = Path("../output/link_scan")

-OUTPUT_RESOLVED = []
-OUTPUT_UNRESOLVED = []
+OUTPUT_DIR.mkdir(exist_ok=True)

-# ======================
+# --------------------------------------------------
+# LOAD REGISTRY
+# --------------------------------------------------
+
+registry = json.load(open(REGISTRY_PATH, encoding="utf-8"))
+
+equivalences = registry["equivalences"]
+canonical_pages = registry["canonical_pages"]
+
+valid_targets = set(canonical_pages.values())
+
+# --------------------------------------------------
 # HELPERS
-# ======================
+# --------------------------------------------------

+def normalize_title(title: str | None):
+    if not title:
+        return None
+
+    title = unquote(title)
+    title = title.replace("_", " ")
+    title = re.sub(r"\s+", " ", title.strip())
+    return title.casefold()
+
+
+# -------------------------
+# Extract MediaWiki target
+# -------------------------
+
+def extract_mediawiki_target(href: str):
+
-def normalize_href(href: str):
     if not href:
         return None

-    # ignore external links
-    if href.startswith("http"):
+    # ignore anchors
+    if href.startswith("#"):
         return None

-    name = Path(href).stem
-    return name.lower()
+    parsed = urlparse(href)
+
+    # external link
+    if parsed.scheme in ("http", "https"):
+        return None
+
+    path = parsed.path or ""
+
+    # /wiki/Page_Name
+    if "/wiki/" in path:
+        return path.split("/wiki/", 1)[1]
+
+    # index.php?title=Page
+    if "index.php" in path:
+        qs = parse_qs(parsed.query)
+        if "title" in qs:
+            return qs["title"][0]
+
+    # fallback filename-like
+    return Path(path).stem


-def resolve(name):
-    if name in title_registry:
-        return name
+# -------------------------
+# Ignore unwanted namespaces
+# -------------------------

-    if name in alias_registry:
-        return alias_registry[name]
+IGNORED_PREFIXES = (
+    "file:",
+    "image:",
+    "template:",
+    "special:",
+    "help:",
+    "user:",
+    "talk:",
+)

-    # try removing category prefix
-    if name.startswith("category_"):
-        alt = name.replace("category_", "", 1)
-        if alt in title_registry:
-            return alt
+def is_ignored_namespace(title_norm: str):
+    return title_norm.startswith(IGNORED_PREFIXES)

-    return None

+# -------------------------
+# Extract article content
+# -------------------------

 def extract_article_links(soup):

@@ -52,33 +104,26 @@ def extract_article_links(soup):

     links = []

-    for a in content.find_all("a", href=True):
+    for a in content.select("a[href]"):

-        href = a["href"]
-
-        # ignore anchors
-        if href.startswith("#"):
-            continue
-
-        # ignore files/images/history/etc
-        if any(prefix in href.lower() for prefix in [
-            "file_",
-            "image:",
-            "special:",
-            "action=",
-        ]):
+        # ignore navboxes / metadata
+        if a.find_parent(class_="navbox"):
             continue

+        href = a.get("href")
         links.append(href)

     return links


-# ======================
-# MAIN
-# ======================
+# --------------------------------------------------
+# MAIN SCAN
+# --------------------------------------------------

-files = list(INPUT_DIR.glob("*.html"))
+resolved_links = []
+unresolved_links = []
+
+files = list(PAGES_DIR.glob("*.html"))
 print(f"{len(files)} pages à analyser")

 for i, file_path in enumerate(files, 1):

@@ -90,33 +135,50 @@ for i, file_path in enumerate(files, 1):

     for href in links:

-        key = normalize_href(href)
-        if not key:
+        raw_target = extract_mediawiki_target(href)
+        norm = normalize_title(raw_target)
+
+        if not norm:
             continue

-        resolved = resolve(key)
+        if is_ignored_namespace(norm):
+            continue

         entry = {
             "source": file_path.name,
-            "link": href,
+            "href": href,
+            "normalized": norm,
         }

+        resolved = equivalences.get(norm)
+
         if resolved:
-            entry["target"] = resolved
-            OUTPUT_RESOLVED.append(entry)
+            entry["resolved_title"] = resolved
+            resolved_links.append(entry)
         else:
-            OUTPUT_UNRESOLVED.append(entry)
+            unresolved_links.append(entry)

     if i % 100 == 0:
         print(f"{i}/{len(files)} analysées")

-# ======================
-# SAVE
-# ======================
+# --------------------------------------------------
+# SAVE RESULTS
+# --------------------------------------------------

-json.dump(OUTPUT_RESOLVED, open(REGISTRY_DIR / "resolved_links.json","w",encoding="utf-8"), indent=2)
-json.dump(OUTPUT_UNRESOLVED, open(REGISTRY_DIR / "unresolved_links.json","w",encoding="utf-8"), indent=2)
+json.dump(
+    resolved_links,
+    open(OUTPUT_DIR / "resolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)
+
+json.dump(
+    unresolved_links,
+    open(OUTPUT_DIR / "unresolved_links.json", "w", encoding="utf-8"),
+    indent=2,
+    ensure_ascii=False,
+)

 print("\n✅ LINK SCAN COMPLETE")
-print("Resolved:", len(OUTPUT_RESOLVED))
-print("Unresolved:", len(OUTPUT_UNRESOLVED))
+print("Resolved:", len(resolved_links))
+print("Unresolved:", len(unresolved_links))
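As a quick sanity check on the new lookup chain (extract the MediaWiki target from an href, normalize it, then look it up in the equivalence registry), a minimal, self-contained sketch is shown below. The helpers are trimmed copies of the ones added in this commit; the equivalences dict and the sample hrefs are made up for illustration only.

import re
from pathlib import Path
from urllib.parse import urlparse, parse_qs, unquote

def extract_mediawiki_target(href):
    # anchors and external URLs are not internal wiki targets
    if not href or href.startswith("#"):
        return None
    parsed = urlparse(href)
    if parsed.scheme in ("http", "https"):
        return None
    path = parsed.path or ""
    if "/wiki/" in path:
        return path.split("/wiki/", 1)[1]
    if "index.php" in path:
        qs = parse_qs(parsed.query)
        if "title" in qs:
            return qs["title"][0]
    return Path(path).stem

def normalize_title(title):
    # percent-decode, replace underscores, collapse whitespace, casefold
    if not title:
        return None
    title = unquote(title).replace("_", " ")
    return re.sub(r"\s+", " ", title.strip()).casefold()

# toy registry: normalized title -> canonical page title (made-up values)
equivalences = {"main page": "Main Page", "sandbox": "Project:Sandbox"}

for href in ("/wiki/Main_Page", "/index.php?title=Sandbox", "#section", "https://example.org"):
    norm = normalize_title(extract_mediawiki_target(href))
    print(href, "->", norm, "->", equivalences.get(norm))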