fix page dedup
This commit is contained in:
parent
c7b45432b1
commit
a3f3e61e7d
1 changed file with 88 additions and 66 deletions
|
|
@ -48,12 +48,9 @@ def extract_page_identity(html: str):
|
||||||
page = extract_wg_page_name(html)
|
page = extract_wg_page_name(html)
|
||||||
if page:
|
if page:
|
||||||
return page
|
return page
|
||||||
|
|
||||||
# fallback title tag
|
|
||||||
m = re.search(r"<title>(.*?) -", html, re.I)
|
m = re.search(r"<title>(.*?) -", html, re.I)
|
||||||
if m:
|
if m:
|
||||||
return html.unescape(m.group(1))
|
return html.unescape(m.group(1))
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -66,18 +63,6 @@ def extract_article_id(html: str) -> int | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_redirect(html: str) -> str | None:
|
|
||||||
m = re.search(r"#REDIRECT\s*\[\[(.*?)]]", html, re.I)
|
|
||||||
if m:
|
|
||||||
return m.group(1).strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def namespace_of(title: str):
|
|
||||||
if ":" in title:
|
|
||||||
return title.split(":", 1)[0]
|
|
||||||
|
|
||||||
|
|
||||||
def extract_internal_redirect(page_html: str):
|
def extract_internal_redirect(page_html: str):
|
||||||
m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
|
m = re.search(r'"wgRedirectedFrom"\s*:\s*"([^"]+)"', page_html)
|
||||||
if m:
|
if m:
|
||||||
|
|
@ -91,26 +76,24 @@ def extract_namespace(html: str) -> str:
|
||||||
return m.group(1)
|
return m.group(1)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# Registry structures
|
# Registry structures
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
canonical_pages = {}
|
|
||||||
equivalences = {}
|
|
||||||
redirects = {}
|
|
||||||
ignored_pages = []
|
ignored_pages = []
|
||||||
problems = []
|
problems = []
|
||||||
|
redirects = {}
|
||||||
|
all_variants = defaultdict(list)
|
||||||
|
|
||||||
files = list(SOURCE_DIR.glob("*.html"))
|
files = list(SOURCE_DIR.glob("*.html"))
|
||||||
print(f"{len(files)} fichiers trouvés")
|
print(f"{len(files)} fichiers trouvés")
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 1 — analyse
|
# PASS 1 — analyse et collecte des variantes
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
for i, path in enumerate(files, 1):
|
for i, path in enumerate(files, 1):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
page_html = path.read_text(encoding="utf-8", errors="ignore")
|
page_html = path.read_text(encoding="utf-8", errors="ignore")
|
||||||
|
|
||||||
|
|
@ -126,50 +109,51 @@ for i, path in enumerate(files, 1):
|
||||||
|
|
||||||
ns = extract_namespace(page_html)
|
ns = extract_namespace(page_html)
|
||||||
|
|
||||||
# -------------------------
|
# Ignorer certains namespaces
|
||||||
# Ignore namespaces
|
|
||||||
# -------------------------
|
|
||||||
if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
|
if ns in ("File", "Template", "User", "Talk", "File talk", "Category talk", "User talk"):
|
||||||
ignored_pages.append(path.name)
|
ignored_pages.append(path.name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = html.unescape(title)
|
title = html.unescape(title)
|
||||||
norm = normalize_title(title)
|
norm = normalize_title(title)
|
||||||
|
base_title = norm
|
||||||
|
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
||||||
|
is_category = ns == "Category" or norm.startswith("category:")
|
||||||
|
|
||||||
|
# redirect interne
|
||||||
# -------------------------
|
|
||||||
# Category pages
|
|
||||||
# -------------------------
|
|
||||||
# Category pages CAN be canonical content
|
|
||||||
if ns == "Category":
|
|
||||||
norm = normalize_title(title)
|
|
||||||
equivalences[norm] = norm
|
|
||||||
|
|
||||||
# -------------------------
|
|
||||||
# Redirect detection
|
|
||||||
# -------------------------
|
|
||||||
redir = extract_internal_redirect(page_html)
|
redir = extract_internal_redirect(page_html)
|
||||||
if redir:
|
if redir:
|
||||||
redirects[normalize_title(redir)] = norm
|
redirects[normalize_title(redir)] = norm
|
||||||
# -------------------------
|
|
||||||
# Canonical article
|
|
||||||
# -------------------------
|
|
||||||
|
|
||||||
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
# Categories
|
||||||
if article_id not in canonical_pages:
|
if ns == "Category":
|
||||||
canonical_pages[article_id] = {
|
m_title = re.search(r'"wgTitle"\s*:\s*"([^"]+)"', page_html)
|
||||||
"path": path,
|
|
||||||
"title": norm,
|
if m_title:
|
||||||
"redirect": is_redirect,
|
wg_title = html.unescape(m_title.group(1))
|
||||||
}
|
cat_base = normalize_title(wg_title)
|
||||||
elif canonical_pages[article_id]["redirect"] and not is_redirect:
|
|
||||||
canonical_pages[article_id] = {
|
page_name = extract_wg_page_name(page_html)
|
||||||
"path": path,
|
page_norm = normalize_title(page_name) if page_name else None
|
||||||
"title": norm,
|
|
||||||
"redirect": is_redirect,
|
if page_norm and page_norm != f"category:{cat_base}":
|
||||||
}
|
# page réelle déguisée en category
|
||||||
# self equivalence
|
base_title = page_norm
|
||||||
equivalences[norm] = norm
|
is_category = False
|
||||||
|
else:
|
||||||
|
base_title = cat_base
|
||||||
|
is_category = True
|
||||||
|
else:
|
||||||
|
base_title = norm.replace("category:", "", 1)
|
||||||
|
is_category = True
|
||||||
|
|
||||||
|
all_variants[article_id].append({
|
||||||
|
"path": path,
|
||||||
|
"title": base_title,
|
||||||
|
"article_id": article_id,
|
||||||
|
"redirect": is_redirect,
|
||||||
|
"is_category": is_category,
|
||||||
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
problems.append(f"{path}: {e}")
|
problems.append(f"{path}: {e}")
|
||||||
|
|
@ -177,9 +161,57 @@ for i, path in enumerate(files, 1):
|
||||||
if i % 200 == 0:
|
if i % 200 == 0:
|
||||||
print(f"{i}/{len(files)} analysés")
|
print(f"{i}/{len(files)} analysés")
|
||||||
|
|
||||||
|
# --------------------------------------------------
|
||||||
|
# PASS 2 — choix des versions canoniques
|
||||||
|
# --------------------------------------------------
|
||||||
|
|
||||||
|
canonical_pages = {}
|
||||||
|
equivalences = {}
|
||||||
|
category_replaced = 0
|
||||||
|
nb_all_cat = 0
|
||||||
|
|
||||||
|
|
||||||
|
def variant_score(v):
    """Return the sort key used to rank the variants of one page.

    Keys are tuples compared element by element; the variant with the
    smallest key is the best candidate.

    Args:
        v: Variant record with at least the keys ``"is_category"`` (bool),
            ``"redirect"`` (bool) and ``"path"`` (an object exposing
            ``.name``, e.g. ``pathlib.Path``).

    Returns:
        A tuple ordering variants by: non-category first, non-redirect first,
        filename without ``"category:"`` first, shorter filename first, then
        filename in lexicographic order.
    """
    filename = v["path"].name
    return (
        v["is_category"],  # False (0) sorts ahead of True (1)
        v["redirect"],  # a real page beats a redirect
        # Filename safety net.
        # NOTE(review): only matches a literal "category:" in the filename;
        # confirm that is how category files are named on disk.
        "category:" in filename.lower(),
        len(filename),  # prefer the shortest filename
        # Final tie-breaker: without it, two names of equal length compare
        # equal and the winner depends on the order the files were listed in.
        filename,
    )
|
||||||
|
|
||||||
|
|
||||||
|
for article_id, variants in all_variants.items():
|
||||||
|
|
||||||
|
# tri déterministe
|
||||||
|
variants_sorted = sorted(variants, key=variant_score)
|
||||||
|
|
||||||
|
chosen = variants_sorted[0]
|
||||||
|
|
||||||
|
if all(v["is_category"] for v in variants):
|
||||||
|
nb_all_cat += 1
|
||||||
|
|
||||||
|
if chosen["is_category"]:
|
||||||
|
category_replaced += 1
|
||||||
|
|
||||||
|
canonical_pages[article_id] = {
|
||||||
|
"path": chosen["path"],
|
||||||
|
"title": chosen["title"],
|
||||||
|
"redirect": chosen["redirect"],
|
||||||
|
}
|
||||||
|
|
||||||
|
# équivalences
|
||||||
|
for v in variants:
|
||||||
|
equivalences[v["title"]] = chosen["title"]
|
||||||
|
|
||||||
|
|
||||||
|
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
|
||||||
|
print(f"{category_replaced} 'category_*' remplacées par leur version de base")
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 2 — resolve redirects
|
# PASS 3 — resolve redirects
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
def resolve_redirect(key):
|
def resolve_redirect(key):
|
||||||
|
|
@ -189,24 +221,18 @@ def resolve_redirect(key):
|
||||||
key = redirects[key]
|
key = redirects[key]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
for k, v in list(redirects.items()):
|
for k, v in list(redirects.items()):
|
||||||
equivalences[k] = resolve_redirect(v)
|
equivalences[k] = resolve_redirect(v)
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 3 — copy canonical pages
|
# PASS 4 — copie des pages canoniques
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
copied = 0
|
copied = 0
|
||||||
|
|
||||||
# for key, src in canonical_pages.items():
|
|
||||||
for key, data in canonical_pages.items():
|
for key, data in canonical_pages.items():
|
||||||
|
|
||||||
src = data["path"]
|
src = data["path"]
|
||||||
dst_name = sanitize_filename(src.name)
|
dst_name = sanitize_filename(src.name)
|
||||||
dst = PAGES_DIR / dst_name
|
dst = PAGES_DIR / dst_name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
shutil.copy2(src, dst)
|
shutil.copy2(src, dst)
|
||||||
canonical_pages[key] = dst_name
|
canonical_pages[key] = dst_name
|
||||||
|
|
@ -216,7 +242,6 @@ for key, data in canonical_pages.items():
|
||||||
|
|
||||||
print(f"{copied} pages copiées")
|
print(f"{copied} pages copiées")
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# SAVE REGISTRY
|
# SAVE REGISTRY
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -229,11 +254,9 @@ registry = {
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTRY_PATH.parent.mkdir(exist_ok=True)
|
REGISTRY_PATH.parent.mkdir(exist_ok=True)
|
||||||
|
|
||||||
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
|
with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
|
||||||
json.dump(registry, f, indent=2, ensure_ascii=False)
|
json.dump(registry, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# REPORT
|
# REPORT
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -245,7 +268,6 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
||||||
f.write(f"Redirects: {len(redirects)}\n")
|
f.write(f"Redirects: {len(redirects)}\n")
|
||||||
f.write(f"Ignored: {len(ignored_pages)}\n")
|
f.write(f"Ignored: {len(ignored_pages)}\n")
|
||||||
f.write(f"Problems: {len(problems)}\n\n")
|
f.write(f"Problems: {len(problems)}\n\n")
|
||||||
|
|
||||||
for p in problems[:200]:
|
for p in problems[:200]:
|
||||||
f.write(p + "\n")
|
f.write(p + "\n")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue