Avoid overwriting homonym canonicals
This commit is contained in:
parent
8e9289998b
commit
61d7f6b646
2 changed files with 19 additions and 11 deletions
|
|
@ -9,7 +9,7 @@ import unicodedata
|
|||
# PATHS
|
||||
# --------------------------------------------------
|
||||
|
||||
PAGES_DIR = Path("../output/pages")
|
||||
PAGES_DIR = Path("../output/cleaned_pages")
|
||||
REGISTRY_PATH = Path("../output/equivalence_registry.json")
|
||||
OUTPUT_DIR = Path("../output/link_scan")
|
||||
|
||||
|
|
@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values())
|
|||
# --------------------------------------------------
|
||||
|
||||
def normalize_title(title: str) -> str:
|
||||
if not title:
|
||||
return
|
||||
title = title.strip()
|
||||
title = unicodedata.normalize("NFKC", title)
|
||||
title = title.replace("_", " ")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue