detect error pages
This commit is contained in:
parent
6cda3cf523
commit
313c92e928
1 changed files with 73 additions and 4 deletions
|
|
@ -29,8 +29,42 @@ IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
|
|||
# Captures the (possibly empty) string value of the "wgCanonicalNamespace" key.
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
|
||||
# Captures the non-empty string value of the "wgTitle" key; the capture stops
# at the first double quote it meets.
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
|
||||
# Matches a whole string made only of lower-case ASCII letters and digits.
# NOTE(review): the trailing `[0-9]*` is redundant because `[a-z0-9]+` already
# accepts digits; if "letters followed by digits" was intended, confirm and
# tighten the first class to `[a-z]+`.
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+[0-9]*$")
|
||||
|
||||
# Matches a literal backslash-u escape followed by four hex digits; group 1
# holds the hex digits.
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
|
||||
# Lower-case substrings searched for in a lower-cased page; finding any one of
# them makes is_error_page() classify the page as an error page.
ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    # NOTE(review): presumably truncated on purpose so that both "timeout"
    # and "time-out" spellings match -- confirm.
    "504 gateway time",

    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",

    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]


def is_error_page(page_html: str) -> bool:
    """Return True when *page_html* is HTML that contains a known error phrase.

    The input is lower-cased before any comparison, so matching is
    case-insensitive. Text that has no ``<html`` substring is never
    classified as an error page, whatever else it contains.

    Args:
        page_html: Raw page content to inspect.

    Returns:
        True if the lower-cased text contains ``<html`` and at least one
        entry of ``ERROR_PAGE_PATTERNS``; False otherwise.
    """
    lowered = page_html.lower()
    has_html_tag = "<html" in lowered
    return has_html_tag and any(
        marker in lowered for marker in ERROR_PAGE_PATTERNS
    )
|
||||
|
||||
def decode_mediawiki_string(s: str) -> str:
|
||||
if not s:
|
||||
|
|
@ -177,7 +211,24 @@ for i, path in enumerate(files, 1):
|
|||
|
||||
article_id = extract_article_id(page_html)
|
||||
if not article_id:
|
||||
ignored_pages.append(path.name)
|
||||
if not is_error_page(page_html):
|
||||
ignored_pages.append(path.name)
|
||||
continue
|
||||
|
||||
fake_title = normalize_title(path.stem)
|
||||
|
||||
all_variants[f"error::{fake_title}"].append({
|
||||
"path": path,
|
||||
"title": fake_title,
|
||||
"canonical_key": fake_title,
|
||||
"article_id": None,
|
||||
"wg_title": None,
|
||||
"redirect": False,
|
||||
"is_category": False,
|
||||
"is_listing_only": False,
|
||||
"is_error": True,
|
||||
})
|
||||
problems.append(f"Error page detected: {path.name}")
|
||||
continue
|
||||
|
||||
title = extract_page_identity(page_html)
|
||||
|
|
@ -244,6 +295,7 @@ for i, path in enumerate(files, 1):
|
|||
"redirect": is_redirect,
|
||||
"is_category": is_category,
|
||||
"is_listing_only": is_listing_only,
|
||||
"is_error": False,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -263,7 +315,7 @@ potential_tags = defaultdict(list)
|
|||
equivalences = {}
|
||||
category_renamed = 0
|
||||
category_not_chosen = 0
|
||||
|
||||
error_pages = []
|
||||
|
||||
def slug_to_title(filename: str) -> str:
|
||||
name = Path(filename).stem
|
||||
|
|
@ -302,7 +354,8 @@ def variant_score(v):
|
|||
"_" in filename or
|
||||
len(filename) > 40
|
||||
)
|
||||
|
||||
if v.get("is_error"):
|
||||
return (True, True, True, True, 0, 9999, "zzz")
|
||||
return (
|
||||
v["is_listing_only"],
|
||||
v["redirect"],
|
||||
|
|
@ -337,12 +390,25 @@ for article_id, variants in all_variants.items():
|
|||
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
|
||||
continue
|
||||
|
||||
if all(v.get("is_error") for v in variants):
|
||||
chosen_variant = variants[0]
|
||||
canonical_pages[article_id] = {
|
||||
"path": chosen_variant["path"],
|
||||
"title": normalize_reference_key(chosen_variant["path"].stem),
|
||||
"redirect": False,
|
||||
}
|
||||
error_pages.append(chosen_variant["path"].name)
|
||||
continue
|
||||
|
||||
canonical_pages[article_id] = {
|
||||
"path": chosen["path"],
|
||||
"title": canonical_slug,
|
||||
"redirect": chosen["redirect"],
|
||||
}
|
||||
|
||||
if chosen.get("is_error"):
|
||||
error_pages.append(chosen["path"].name)
|
||||
|
||||
for v in variants:
|
||||
if v["is_category"] and not v["is_listing_only"]:
|
||||
# catégorie non choisie
|
||||
|
|
@ -360,6 +426,7 @@ print(f"{len(canonical_pages)} pages canoniques")
|
|||
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
||||
print(f"{category_renamed} pages prefix 'category_*' renommées")
|
||||
print(f"{len(potential_tags)} potential_tags enregistrés")
|
||||
print(f"{len(error_pages)} error_pages détectées")
|
||||
|
||||
# --------------------------------------------------
|
||||
# PASS 3 — resolve redirects
|
||||
|
|
@ -479,6 +546,7 @@ registry = {
|
|||
"equivalences": equivalences,
|
||||
"potential_tags": potential_tags,
|
||||
"ignored_pages": ignored_pages,
|
||||
"error_pages": error_pages,
|
||||
}
|
||||
|
||||
REGISTRY_PATH.parent.mkdir(exist_ok=True)
|
||||
|
|
@ -495,6 +563,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
|||
f.write(f"Equivalences: {len(equivalences)}\n")
|
||||
f.write(f"Ignored: {len(ignored_pages)}\n")
|
||||
f.write(f"Problems: {len(problems)}\n\n")
|
||||
f.write(f"Error pages: {len(error_pages)}\n")
|
||||
for p in problems:
|
||||
f.write(p + "\n")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue