From 61d7f6b6463076d8c9b7845d2f2d3e2d1df0f2e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?=
Date: Wed, 15 Apr 2026 12:10:32 +0200
Subject: [PATCH] avoid overwrite homonym canonicals

---
 prepare_pages_and_registry.py | 26 ++++++++++++++++----------
 scan_internal_links.py        |  4 +++-
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py
index f6517c6..1f0b471 100644
--- a/prepare_pages_and_registry.py
+++ b/prepare_pages_and_registry.py
@@ -9,7 +9,7 @@ from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
 import unicodedata
 
-SOURCE_DIR = Path("../test")
+SOURCE_DIR = Path("../original_index")
 OUTPUT_DIR = Path("../output")
 PAGES_DIR = Path(OUTPUT_DIR / "pages")
 
@@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
         title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold()
         + ".html"
     )
 
-
+output_canonical_pages = {}
+name_registry = {}
 copied = 0
+collision = 0
 total = len(canonical_pages)
 
 for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
-    src = data["path"]
-    dst_name = title_to_filename(data["title"])
-    dst = PAGES_DIR / dst_name
-
+    src, base_name = data["path"], title_to_filename(data["title"])
+    if base_name in name_registry:
+        base_name = Path(base_name).stem
+        base_name = f"{base_name}__{article_id}.html"
+        collision += 1
+        problems.append(f"Resolved collision: {base_name} (from {src})")
+    name_registry[base_name] = article_id
+    dst = PAGES_DIR / base_name
     try:
         shutil.copy2(src, dst)
-        canonical_pages[article_id] = dst_name
+        output_canonical_pages[article_id] = base_name
         copied += 1
     except Exception as e:
         problems.append(f"Copy failed {src}: {e}")
-
     if i % 200 == 0 or i == total:
         print(f"{i}/{total} copiés")
 
 print(f"{copied} pages copiées")
+print(f"{collision} collisions détectées")
 
 # --------------------------------------------------
 # SAVE REGISTRY
 # --------------------------------------------------
 
 registry = {
-    "canonical_pages": canonical_pages,
+    "canonical_pages": output_canonical_pages,
     "equivalences": equivalences,
     "potential_tags": potential_tags,
     "ignored_pages": ignored_pages,
@@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
 
 with open(REPORT_PATH, "w", encoding="utf-8") as f:
     f.write("=== MIGRATION REPORT ===\n")
-    f.write(f"Canonical pages: {len(canonical_pages)}\n")
+    f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
     f.write(f"Equivalences: {len(equivalences)}\n")
     f.write(f"Ignored: {len(ignored_pages)}\n")
     f.write(f"Problems: {len(problems)}\n\n")
diff --git a/scan_internal_links.py b/scan_internal_links.py
index 25fc6d7..385e9a7 100644
--- a/scan_internal_links.py
+++ b/scan_internal_links.py
@@ -9,7 +9,7 @@ import unicodedata
 # PATHS
 # --------------------------------------------------
 
-PAGES_DIR = Path("../output/pages")
+PAGES_DIR = Path("../output/cleaned_pages")
 REGISTRY_PATH = Path("../output/equivalence_registry.json")
 OUTPUT_DIR = Path("../output/link_scan")
 
@@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values())
 # --------------------------------------------------
 
 def normalize_title(title: str) -> str:
+    if not title:
+        return ""
     title = title.strip()
     title = unicodedata.normalize("NFKC", title)
     title = title.replace("_", " ")