keep error pages with fallback content

2026-04-15 10:36:21 +02:00 · 2026-04-15 10:36:21 +02:00 · 8e9289998b
commit 8e9289998b
parent 4e473ba2c9
2 changed files with 63 additions and 14 deletions
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -3,6 +3,7 @@ import json
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, unquote
+import unicodedata

 # --------------------------------------------------
 # PATHS
@@ -29,16 +30,14 @@ valid_targets = set(canonical_pages.values())
 # HELPERS
 # --------------------------------------------------

-def normalize_title(title: str | None):
-    if not title:
-        return None
-
-    title = unquote(title)
+def normalize_title(title: str) -> str:
+    title = title.strip()
+    title = unicodedata.normalize("NFKC", title)
    title = title.replace("_", " ")
-    title = re.sub(r"\s+", " ", title.strip())
+    title = title.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    title = re.sub(r"\s+", " ", title)
    return title.casefold()

-
 # -------------------------
 # Extract MediaWiki target
 # -------------------------