Avoid overwriting homonymous canonical pages
This commit is contained in:
parent
8e9289998b
commit
61d7f6b646
2 changed files with 19 additions and 11 deletions
|
|
@ -9,7 +9,7 @@ from difflib import SequenceMatcher
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
SOURCE_DIR = Path("../test")
|
SOURCE_DIR = Path("../original_index")
|
||||||
OUTPUT_DIR = Path("../output")
|
OUTPUT_DIR = Path("../output")
|
||||||
|
|
||||||
PAGES_DIR = Path(OUTPUT_DIR / "pages")
|
PAGES_DIR = Path(OUTPUT_DIR / "pages")
|
||||||
|
|
@ -535,34 +535,40 @@ def title_to_filename(title: str) -> str:
|
||||||
title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
|
title.replace(" ", "_").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"').casefold() + ".html"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
output_canonical_pages = {}
|
||||||
|
name_registry = {}
|
||||||
copied = 0
|
copied = 0
|
||||||
|
collision = 0
|
||||||
total = len(canonical_pages)
|
total = len(canonical_pages)
|
||||||
|
|
||||||
for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
|
for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
|
||||||
|
|
||||||
src = data["path"]
|
src = data["path"]
|
||||||
dst_name = title_to_filename(data["title"])
|
base_name = title_to_filename(data["title"])
|
||||||
dst = PAGES_DIR / dst_name
|
if base_name in name_registry:
|
||||||
|
base_name = Path(base_name).stem
|
||||||
|
base_name = f"{base_name}__{article_id}.html"
|
||||||
|
collision += 1
|
||||||
|
problems.append(f"Resolved collision: {base_name} (from {src})")
|
||||||
|
name_registry[base_name] = article_id
|
||||||
|
dst = PAGES_DIR / base_name
|
||||||
try:
|
try:
|
||||||
shutil.copy2(src, dst)
|
shutil.copy2(src, dst)
|
||||||
canonical_pages[article_id] = dst_name
|
output_canonical_pages[article_id] = base_name
|
||||||
copied += 1
|
copied += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
problems.append(f"Copy failed {src}: {e}")
|
problems.append(f"Copy failed {src}: {e}")
|
||||||
|
|
||||||
if i % 200 == 0 or i == total:
|
if i % 200 == 0 or i == total:
|
||||||
print(f"{i}/{total} copiés")
|
print(f"{i}/{total} copiés")
|
||||||
|
|
||||||
print(f"{copied} pages copiées")
|
print(f"{copied} pages copiées")
|
||||||
|
print(f"{collision} collisions détectées")
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# SAVE REGISTRY
|
# SAVE REGISTRY
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
registry = {
|
registry = {
|
||||||
"canonical_pages": canonical_pages,
|
"canonical_pages": output_canonical_pages,
|
||||||
"equivalences": equivalences,
|
"equivalences": equivalences,
|
||||||
"potential_tags": potential_tags,
|
"potential_tags": potential_tags,
|
||||||
"ignored_pages": ignored_pages,
|
"ignored_pages": ignored_pages,
|
||||||
|
|
@ -579,7 +585,7 @@ with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
|
||||||
|
|
||||||
with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
||||||
f.write("=== MIGRATION REPORT ===\n")
|
f.write("=== MIGRATION REPORT ===\n")
|
||||||
f.write(f"Canonical pages: {len(canonical_pages)}\n")
|
f.write(f"Canonical pages: {len(output_canonical_pages)}\n")
|
||||||
f.write(f"Equivalences: {len(equivalences)}\n")
|
f.write(f"Equivalences: {len(equivalences)}\n")
|
||||||
f.write(f"Ignored: {len(ignored_pages)}\n")
|
f.write(f"Ignored: {len(ignored_pages)}\n")
|
||||||
f.write(f"Problems: {len(problems)}\n\n")
|
f.write(f"Problems: {len(problems)}\n\n")
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import unicodedata
|
||||||
# PATHS
|
# PATHS
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
PAGES_DIR = Path("../output/pages")
|
PAGES_DIR = Path("../output/cleaned_pages")
|
||||||
REGISTRY_PATH = Path("../output/equivalence_registry.json")
|
REGISTRY_PATH = Path("../output/equivalence_registry.json")
|
||||||
OUTPUT_DIR = Path("../output/link_scan")
|
OUTPUT_DIR = Path("../output/link_scan")
|
||||||
|
|
||||||
|
|
@ -31,6 +31,8 @@ valid_targets = set(canonical_pages.values())
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
def normalize_title(title: str) -> str:
|
def normalize_title(title: str) -> str:
|
||||||
|
if not title:
|
||||||
|
return
|
||||||
title = title.strip()
|
title = title.strip()
|
||||||
title = unicodedata.normalize("NFKC", title)
|
title = unicodedata.normalize("NFKC", title)
|
||||||
title = title.replace("_", " ")
|
title = title.replace("_", " ")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue