diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index d97282a..52416d2 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -29,8 +29,42 @@ IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true') NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"') WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"') SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$") - UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})') +ERROR_PAGE_PATTERNS = [ + # 5xx + "503 service unavailable", + "502 bad gateway", + "500 internal server error", + "504 gateway time", + + # 4xx + "400 bad request", + "401 unauthorized", + "403 forbidden", + "404 not found", + "408 request time", + "419 page expired", + "429 too many requests", + + # génériques + "temporarily busy", + "server error", + "internal error", + "page not found", + "request could not be satisfied", +] + +def is_error_page(page_html: str) -> bool: + text = page_html.lower() + + if " str: if not s: @@ -177,7 +211,24 @@ for i, path in enumerate(files, 1): article_id = extract_article_id(page_html) if not article_id: - ignored_pages.append(path.name) + if not is_error_page(page_html): + ignored_pages.append(path.name) + continue + + fake_title = normalize_title(path.stem) + + all_variants[f"error::{fake_title}"].append({ + "path": path, + "title": fake_title, + "canonical_key": fake_title, + "article_id": None, + "wg_title": None, + "redirect": False, + "is_category": False, + "is_listing_only": False, + "is_error": True, + }) + problems.append(f"Error page detected: {path.name}") continue title = extract_page_identity(page_html) @@ -244,6 +295,7 @@ for i, path in enumerate(files, 1): "redirect": is_redirect, "is_category": is_category, "is_listing_only": is_listing_only, + "is_error": False, }) except Exception as e: @@ -263,7 +315,7 @@ potential_tags = defaultdict(list) equivalences = {} category_renamed = 0 category_not_chosen = 0 - +error_pages = [] def slug_to_title(filename: str) -> str: name = Path(filename).stem @@ -302,7 +354,8 @@ def variant_score(v): "_" in filename or len(filename) > 40 ) - + if v.get("is_error"): + return (True, True, True, True, 0, 9999, "zzz") return ( v["is_listing_only"], v["redirect"], @@ -337,12 +390,25 @@ for article_id, variants in all_variants.items(): potential_tags[tag_name].append(normalize_reference_key(v["wg_title"])) continue + if all(v.get("is_error") for v in variants): + chosen_variant = variants[0] + canonical_pages[article_id] = { + "path": chosen_variant["path"], + "title": normalize_reference_key(chosen_variant["path"].stem), + "redirect": False, + } + error_pages.append(chosen_variant["path"].name) + continue + canonical_pages[article_id] = { "path": chosen["path"], "title": canonical_slug, "redirect": chosen["redirect"], } + if chosen.get("is_error"): + error_pages.append(chosen["path"].name) + for v in variants: if v["is_category"] and not v["is_listing_only"]: # catégorie non choisie @@ -360,6 +426,7 @@ print(f"{len(canonical_pages)} pages canoniques") print(f"{category_not_chosen} pages homonymes 'category_*' non retenues") print(f"{category_renamed} pages prefix 'category_*' renommées") print(f"{len(potential_tags)} potential_tags enregistrés") +print(f"{len(error_pages)} error_pages détectées") # -------------------------------------------------- # PASS 3 — resolve redirects @@ -479,6 +546,7 @@ registry = { "equivalences": equivalences, "potential_tags": potential_tags, "ignored_pages": ignored_pages, + "error_pages": error_pages, } REGISTRY_PATH.parent.mkdir(exist_ok=True) @@ -495,6 +563,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f: f.write(f"Equivalences: {len(equivalences)}\n") f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Problems: {len(problems)}\n\n") + f.write(f"Error pages: {len(error_pages)}\n") for p in problems: f.write(p + "\n")