Avoid overwriting homonym canonicals

This commit is contained in:
Maxime Réaux 2026-04-15 12:10:32 +02:00
parent 8e9289998b
commit 61d7f6b646
2 changed files with 19 additions and 11 deletions

View file

@ -9,7 +9,7 @@ import unicodedata
# PATHS
# --------------------------------------------------
# All paths below are relative, so they resolve against the process's current
# working directory when used, not against this file's location.
#
# NOTE(review): PAGES_DIR is assigned twice in a row; at runtime only the
# second value ("../output/cleaned_pages") is in effect. This looks like the
# old/new pair of a diff whose +/- markers were lost -- confirm and drop the
# stale "../output/pages" line if so.
PAGES_DIR = Path("../output/pages")
PAGES_DIR = Path("../output/cleaned_pages")
# presumably the JSON registry of equivalent/canonical page titles -- verify
# against the code that loads it
REGISTRY_PATH = Path("../output/equivalence_registry.json")
# presumably where the link-scan results are written -- TODO confirm
OUTPUT_DIR = Path("../output/link_scan")
@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values())
# --------------------------------------------------
def normalize_title(title: str) -> str:
if not title:
return
title = title.strip()
title = unicodedata.normalize("NFKC", title)
title = title.replace("_", " ")