detect error pages

Maxime Réaux 2026-04-10 13:42:42 +02:00
parent 6cda3cf523
commit 313c92e928


@@ -29,8 +29,42 @@ IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')

ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]
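
# A page counts as an error page only when it looks like HTML and contains
# one of the status phrases above (matched case-insensitively).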
def is_error_page(page_html: str) -> bool:
    text = page_html.lower()
    if "<html" not in text:
        return False
    for pattern in ERROR_PAGE_PATTERNS:
        if pattern in text:
            return True
    return False
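
# Illustrative behaviour (hypothetical inputs):
#   is_error_page("<html><body><h1>503 Service Unavailable</h1></body></html>")  # -> True
#   is_error_page("503 service unavailable")  # -> False: no "<html" marker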

def decode_mediawiki_string(s: str) -> str:
    if not s:
@@ -177,7 +211,24 @@ for i, path in enumerate(files, 1):
        article_id = extract_article_id(page_html)
        if not article_id:
            if not is_error_page(page_html):
                ignored_pages.append(path.name)
                continue
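
            # No usable article id, but the HTML matches an error pattern:
            # file it under a synthetic "error::" key so it cannot collide
            # with a real article_id, and flag it in the problems report.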
            fake_title = normalize_title(path.stem)
            all_variants[f"error::{fake_title}"].append({
                "path": path,
                "title": fake_title,
                "canonical_key": fake_title,
                "article_id": None,
                "wg_title": None,
                "redirect": False,
                "is_category": False,
                "is_listing_only": False,
                "is_error": True,
            })
            problems.append(f"Error page detected: {path.name}")
            continue
        title = extract_page_identity(page_html)
@@ -244,6 +295,7 @@ for i, path in enumerate(files, 1):
            "redirect": is_redirect,
            "is_category": is_category,
            "is_listing_only": is_listing_only,
            "is_error": False,
        })
    except Exception as e:
@@ -263,7 +315,7 @@ potential_tags = defaultdict(list)
equivalences = {}
category_renamed = 0
category_not_chosen = 0
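# Filenames of canonical entries that are error pages (for the final report).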
error_pages = []

def slug_to_title(filename: str) -> str:
    name = Path(filename).stem
@@ -302,7 +354,8 @@ def variant_score(v):
        "_" in filename or
        len(filename) > 40
    )
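    # Sentinel score: ranks error-page variants behind every healthy variant,
    # so an error page wins only when there is nothing better to choose.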
    if v.get("is_error"):
        return (True, True, True, True, 0, 9999, "zzz")
    return (
        v["is_listing_only"],
        v["redirect"],
@@ -337,12 +390,25 @@ for article_id, variants in all_variants.items():
        potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
        continue
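
    # Every variant of this article_id is an error page: keep one placeholder
    # entry keyed by the filename and record it, rather than dropping the page.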
    if all(v.get("is_error") for v in variants):
        chosen_variant = variants[0]
        canonical_pages[article_id] = {
            "path": chosen_variant["path"],
            "title": normalize_reference_key(chosen_variant["path"].stem),
            "redirect": False,
        }
        error_pages.append(chosen_variant["path"].name)
        continue

    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_slug,
        "redirect": chosen["redirect"],
    }
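    # In mixed groups the chosen variant may itself be an error page; track it.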
    if chosen.get("is_error"):
        error_pages.append(chosen["path"].name)

    for v in variants:
        if v["is_category"] and not v["is_listing_only"]:
            # category not chosen
@@ -360,6 +426,7 @@ print(f"{len(canonical_pages)} canonical pages")
print(f"{category_not_chosen} homonymous 'category_*' pages not selected")
print(f"{category_renamed} 'category_*'-prefixed pages renamed")
print(f"{len(potential_tags)} potential_tags recorded")
print(f"{len(error_pages)} error_pages detected")

# --------------------------------------------------
# PASS 3 — resolve redirects
@@ -479,6 +546,7 @@ registry = {
    "equivalences": equivalences,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
    "error_pages": error_pages,
}

REGISTRY_PATH.parent.mkdir(exist_ok=True)
@@ -495,6 +563,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
    f.write(f"Error pages: {len(error_pages)}\n")
    for p in problems:
        f.write(p + "\n")