Avoid overwriting homonymous canonical pages

This commit is contained in:
Maxime Réaux 2026-04-15 12:10:32 +02:00
parent 8e9289998b
commit 61d7f6b646
2 changed files with 19 additions and 11 deletions

View file

@ -9,7 +9,7 @@ from difflib import SequenceMatcher
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import unicodedata import unicodedata
SOURCE_DIR = Path("../test") SOURCE_DIR = Path("../original_index")
OUTPUT_DIR = Path("../output") OUTPUT_DIR = Path("../output")
PAGES_DIR = Path(OUTPUT_DIR / "pages") PAGES_DIR = Path(OUTPUT_DIR / "pages")
@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html" title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
) )
output_canonical_pages = {}
name_registry = {}
copied = 0 copied = 0
collision = 0
total = len(canonical_pages) total = len(canonical_pages)
for i, (article_id, data) in enumerate(canonical_pages.items(), 1): for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
src = data["path"] src = data["path"]
dst_name = title_to_filename(data["title"]) base_name = title_to_filename(data["title"])
dst = PAGES_DIR / dst_name if base_name in name_registry:
base_name = Path(base_name).stem
base_name = f"{base_name}__{article_id}.html"
collision += 1
problems.append(f"Resolved collision: {base_name} (from {src})")
name_registry[base_name] = article_id
dst = PAGES_DIR / base_name
try: try:
shutil.copy2(src, dst) shutil.copy2(src, dst)
canonical_pages[article_id] = dst_name output_canonical_pages[article_id] = base_name
copied += 1 copied += 1
except Exception as e: except Exception as e:
problems.append(f"Copy failed {src}: {e}") problems.append(f"Copy failed {src}: {e}")
if i % 200 == 0 or i == total: if i % 200 == 0 or i == total:
print(f"{i}/{total} copiés") print(f"{i}/{total} copiés")
print(f"{copied} pages copiées") print(f"{copied} pages copiées")
print(f"{collision} collisions détectées")
# -------------------------------------------------- # --------------------------------------------------
# SAVE REGISTRY # SAVE REGISTRY
# -------------------------------------------------- # --------------------------------------------------
registry = { registry = {
"canonical_pages": canonical_pages, "canonical_pages": output_canonical_pages,
"equivalences": equivalences, "equivalences": equivalences,
"potential_tags": potential_tags, "potential_tags": potential_tags,
"ignored_pages": ignored_pages, "ignored_pages": ignored_pages,
@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
with open(REPORT_PATH, "w", encoding="utf-8") as f: with open(REPORT_PATH, "w", encoding="utf-8") as f:
f.write("=== MIGRATION REPORT ===\n") f.write("=== MIGRATION REPORT ===\n")
f.write(f"Canonical pages: {len(canonical_pages)}\n") f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
f.write(f"Equivalences: {len(equivalences)}\n") f.write(f"Equivalences: {len(equivalences)}\n")
f.write(f"Ignored: {len(ignored_pages)}\n") f.write(f"Ignored: {len(ignored_pages)}\n")
f.write(f"Problems: {len(problems)}\n\n") f.write(f"Problems: {len(problems)}\n\n")

View file

@ -9,7 +9,7 @@ import unicodedata
# PATHS # PATHS
# -------------------------------------------------- # --------------------------------------------------
PAGES_DIR = Path("../output/pages") PAGES_DIR = Path("../output/cleaned_pages")
REGISTRY_PATH = Path("../output/equivalence_registry.json") REGISTRY_PATH = Path("../output/equivalence_registry.json")
OUTPUT_DIR = Path("../output/link_scan") OUTPUT_DIR = Path("../output/link_scan")
@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values())
# -------------------------------------------------- # --------------------------------------------------
def normalize_title(title: str) -> str: def normalize_title(title: str) -> str:
if not title:
return
title = title.strip() title = title.strip()
title = unicodedata.normalize("NFKC", title) title = unicodedata.normalize("NFKC", title)
title = title.replace("_", " ") title = title.replace("_", " ")