detect error pages
This commit is contained in:
parent
6cda3cf523
commit
313c92e928
1 changed file with 73 additions and 4 deletions
|
|
@ -29,8 +29,42 @@ IS_REDIRECT_RE = re.compile(r'"wgIsRedirect"\s*:\s*true')
|
||||||
# Regexes applied to the raw HTML of saved wiki pages.  The "wg*" values
# appear to come from MediaWiki's inline JS config blob (JSON-style
# quoting) — confirm against a sample page.
NAMESPACE_RE = re.compile(r'"wgCanonicalNamespace"\s*:\s*"([^"]*)"')
WG_TITLE_RE = re.compile(r'"wgTitle"\s*:\s*"([^"]+)"')
# A bare lowercase alphanumeric slug with no separators.  The previous
# pattern ended with a redundant `[0-9]*`; `[a-z0-9]+` already accepts
# trailing digits, so dropping it leaves the accepted language unchanged.
SHORT_SLUG_RE = re.compile(r"^[a-z0-9]+$")
# One \uXXXX escape sequence, as found in JSON-encoded wg* string values.
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
|
||||||
|
# Lowercase substrings whose presence marks a saved page as a server
# error page rather than real article content.
ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic wording
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]


def is_error_page(page_html: str) -> bool:
    """Return True when *page_html* looks like an HTTP error page.

    Matching is a case-insensitive substring search over the whole
    document.  Payloads without an ``<html`` tag are never classified
    as error pages.
    """
    lowered = page_html.lower()
    # Non-HTML payloads (plain text, JSON, binary residue) are out of scope.
    if "<html" not in lowered:
        return False
    return any(marker in lowered for marker in ERROR_PAGE_PATTERNS)
|
||||||
|
|
||||||
def decode_mediawiki_string(s: str) -> str:
|
def decode_mediawiki_string(s: str) -> str:
|
||||||
if not s:
|
if not s:
|
||||||
|
|
@ -177,7 +211,24 @@ for i, path in enumerate(files, 1):
|
||||||
|
|
||||||
article_id = extract_article_id(page_html)
|
article_id = extract_article_id(page_html)
|
||||||
if not article_id:
|
if not article_id:
|
||||||
ignored_pages.append(path.name)
|
if not is_error_page(page_html):
|
||||||
|
ignored_pages.append(path.name)
|
||||||
|
continue
|
||||||
|
|
||||||
|
fake_title = normalize_title(path.stem)
|
||||||
|
|
||||||
|
all_variants[f"error::{fake_title}"].append({
|
||||||
|
"path": path,
|
||||||
|
"title": fake_title,
|
||||||
|
"canonical_key": fake_title,
|
||||||
|
"article_id": None,
|
||||||
|
"wg_title": None,
|
||||||
|
"redirect": False,
|
||||||
|
"is_category": False,
|
||||||
|
"is_listing_only": False,
|
||||||
|
"is_error": True,
|
||||||
|
})
|
||||||
|
problems.append(f"Error page detected: {path.name}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = extract_page_identity(page_html)
|
title = extract_page_identity(page_html)
|
||||||
|
|
@ -244,6 +295,7 @@ for i, path in enumerate(files, 1):
|
||||||
"redirect": is_redirect,
|
"redirect": is_redirect,
|
||||||
"is_category": is_category,
|
"is_category": is_category,
|
||||||
"is_listing_only": is_listing_only,
|
"is_listing_only": is_listing_only,
|
||||||
|
"is_error": False,
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -263,7 +315,7 @@ potential_tags = defaultdict(list)
|
||||||
equivalences = {}
|
equivalences = {}
|
||||||
category_renamed = 0
|
category_renamed = 0
|
||||||
category_not_chosen = 0
|
category_not_chosen = 0
|
||||||
|
error_pages = []
|
||||||
|
|
||||||
def slug_to_title(filename: str) -> str:
|
def slug_to_title(filename: str) -> str:
|
||||||
name = Path(filename).stem
|
name = Path(filename).stem
|
||||||
|
|
@ -302,7 +354,8 @@ def variant_score(v):
|
||||||
"_" in filename or
|
"_" in filename or
|
||||||
len(filename) > 40
|
len(filename) > 40
|
||||||
)
|
)
|
||||||
|
if v.get("is_error"):
|
||||||
|
return (True, True, True, True, 0, 9999, "zzz")
|
||||||
return (
|
return (
|
||||||
v["is_listing_only"],
|
v["is_listing_only"],
|
||||||
v["redirect"],
|
v["redirect"],
|
||||||
|
|
@ -337,12 +390,25 @@ for article_id, variants in all_variants.items():
|
||||||
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
|
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if all(v.get("is_error") for v in variants):
|
||||||
|
chosen_variant = variants[0]
|
||||||
|
canonical_pages[article_id] = {
|
||||||
|
"path": chosen_variant["path"],
|
||||||
|
"title": normalize_reference_key(chosen_variant["path"].stem),
|
||||||
|
"redirect": False,
|
||||||
|
}
|
||||||
|
error_pages.append(chosen_variant["path"].name)
|
||||||
|
continue
|
||||||
|
|
||||||
canonical_pages[article_id] = {
|
canonical_pages[article_id] = {
|
||||||
"path": chosen["path"],
|
"path": chosen["path"],
|
||||||
"title": canonical_slug,
|
"title": canonical_slug,
|
||||||
"redirect": chosen["redirect"],
|
"redirect": chosen["redirect"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if chosen.get("is_error"):
|
||||||
|
error_pages.append(chosen["path"].name)
|
||||||
|
|
||||||
for v in variants:
|
for v in variants:
|
||||||
if v["is_category"] and not v["is_listing_only"]:
|
if v["is_category"] and not v["is_listing_only"]:
|
||||||
# catégorie non choisie
|
# catégorie non choisie
|
||||||
|
|
@ -360,6 +426,7 @@ print(f"{len(canonical_pages)} pages canoniques")
|
||||||
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
||||||
print(f"{category_renamed} pages prefix 'category_*' renommées")
|
print(f"{category_renamed} pages prefix 'category_*' renommées")
|
||||||
print(f"{len(potential_tags)} potential_tags enregistrés")
|
print(f"{len(potential_tags)} potential_tags enregistrés")
|
||||||
|
print(f"{len(error_pages)} error_pages détectées")
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 3 — resolve redirects
|
# PASS 3 — resolve redirects
|
||||||
|
|
@ -479,6 +546,7 @@ registry = {
|
||||||
"equivalences": equivalences,
|
"equivalences": equivalences,
|
||||||
"potential_tags": potential_tags,
|
"potential_tags": potential_tags,
|
||||||
"ignored_pages": ignored_pages,
|
"ignored_pages": ignored_pages,
|
||||||
|
"error_pages": error_pages,
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTRY_PATH.parent.mkdir(exist_ok=True)
|
REGISTRY_PATH.parent.mkdir(exist_ok=True)
|
||||||
|
|
@ -495,6 +563,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
||||||
f.write(f"Equivalences: {len(equivalences)}\n")
|
f.write(f"Equivalences: {len(equivalences)}\n")
|
||||||
f.write(f"Ignored: {len(ignored_pages)}\n")
|
f.write(f"Ignored: {len(ignored_pages)}\n")
|
||||||
f.write(f"Problems: {len(problems)}\n\n")
|
f.write(f"Problems: {len(problems)}\n\n")
|
||||||
|
f.write(f"Error pages: {len(error_pages)}\n")
|
||||||
for p in problems:
|
for p in problems:
|
||||||
f.write(p + "\n")
|
f.write(p + "\n")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue