detect error pages

Maxime Réaux 2026-04-10 13:42:42 +02:00
parent 6cda3cf523
commit 313c92e928


@@ -29,8 +29,42 @@ IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')

ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]
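
# A page counts as an error page only when it looks like HTML and contains
# one of the status phrases above (matched case-insensitively).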
def is_error_page(page_html: str) -> bool:
    text = page_html.lower()
    if "<html" not in text:
        return False
    for pattern in ERROR_PAGE_PATTERNS:
        if pattern in text:
            return True
    return False
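
# Illustrative behaviour (hypothetical inputs):
#   is_error_page("<html><body><h1>503 Service Unavailable</h1></body></html>")  # -> True
#   is_error_page("503 service unavailable")  # -> False: no "<html" marker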

def decode_mediawiki_string(s: str) -> str:
    if not s:
@@ -177,7 +211,24 @@ for i, path in enumerate(files, 1):
        article_id = extract_article_id(page_html)
        if not article_id:
            if not is_error_page(page_html):
                ignored_pages.append(path.name)
                continue
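
            # No usable article id, but the HTML matches an error pattern:
            # file it under a synthetic "error::" key so it cannot collide
            # with a real article_id, and flag it in the problems report.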
            fake_title = normalize_title(path.stem)
            all_variants[f"error::{fake_title}"].append({
                "path": path,
                "title": fake_title,
                "canonical_key": fake_title,
                "article_id": None,
                "wg_title": None,
                "redirect": False,
                "is_category": False,
                "is_listing_only": False,
                "is_error": True,
            })
            problems.append(f"Error page detected: {path.name}")
            continue
        title = extract_page_identity(page_html)
@@ -244,6 +295,7 @@ for i, path in enumerate(files, 1):
            "redirect": is_redirect,
            "is_category": is_category,
            "is_listing_only": is_listing_only,
            "is_error": False,
        })
    except Exception as e:
@@ -263,7 +315,7 @@ potential_tags = defaultdict(list)
equivalences = {}
category_renamed = 0
category_not_chosen = 0
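# Filenames of canonical entries that are error pages (for the final report).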
error_pages = []

def slug_to_title(filename: str) -> str:
    name = Path(filename).stem
@@ -302,7 +354,8 @@ def variant_score(v):
        "_" in filename or
        len(filename) > 40
    )
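    # Sentinel score: ranks error-page variants behind every healthy variant,
    # so an error page wins only when there is nothing better to choose.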
    if v.get("is_error"):
        return (True, True, True, True, 0, 9999, "zzz")
    return (
        v["is_listing_only"],
        v["redirect"],
@@ -337,12 +390,25 @@ for article_id, variants in all_variants.items():
        potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
        continue
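
    # Every variant of this article_id is an error page: keep one placeholder
    # entry keyed by the filename and record it, rather than dropping the page.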
    if all(v.get("is_error") for v in variants):
        chosen_variant = variants[0]
        canonical_pages[article_id] = {
            "path": chosen_variant["path"],
            "title": normalize_reference_key(chosen_variant["path"].stem),
            "redirect": False,
        }
        error_pages.append(chosen_variant["path"].name)
        continue

    canonical_pages[article_id] = {
        "path": chosen["path"],
        "title": canonical_slug,
        "redirect": chosen["redirect"],
    }
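    # In mixed groups the chosen variant may itself be an error page; track it.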
    if chosen.get("is_error"):
        error_pages.append(chosen["path"].name)

    for v in variants:
        if v["is_category"] and not v["is_listing_only"]:
            # category not chosen
@@ -360,6 +426,7 @@ print(f"{len(canonical_pages)} canonical pages")
print(f"{category_not_chosen} homonymous 'category_*' pages not selected")
print(f"{category_renamed} 'category_*'-prefixed pages renamed")
print(f"{len(potential_tags)} potential_tags recorded")
print(f"{len(error_pages)} error_pages detected")

# --------------------------------------------------
# PASS 3 — resolve redirects
@@ -479,6 +546,7 @@ registry = {
    "equivalences": equivalences,
    "potential_tags": potential_tags,
    "ignored_pages": ignored_pages,
    "error_pages": error_pages,
}

REGISTRY_PATH.parent.mkdir(exist_ok=True)
@@ -495,6 +563,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write(f"Equivalences: {len(equivalences)}\n")
    f.write(f"Ignored: {len(ignored_pages)}\n")
    f.write(f"Problems: {len(problems)}\n\n")
    f.write(f"Error pages: {len(error_pages)}\n")
    for p in problems:
        f.write(p + "\n")