From 0aace3dfc86571d6e930c372880c486abab1a9b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maxime=20R=C3=A9aux?= Date: Fri, 10 Apr 2026 14:39:10 +0200 Subject: [PATCH] force 'league' equivalence --- prepare_pages_and_registry.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/prepare_pages_and_registry.py b/prepare_pages_and_registry.py index 52416d2..222f4d7 100644 --- a/prepare_pages_and_registry.py +++ b/prepare_pages_and_registry.py @@ -53,6 +53,14 @@ ERROR_PAGE_PATTERNS = [ "page not found", "request could not be satisfied", ] +LEAGUE_PREFIX_RE = re.compile(r"^league model\s*-\s*", re.I) + +def strip_league_prefix(title: str) -> str: + title = normalize_title(title) + return LEAGUE_PREFIX_RE.sub("", title).strip() + +def is_league_title(title: str) -> bool: + return bool(LEAGUE_PREFIX_RE.match(normalize_title(title))) def is_error_page(page_html: str) -> bool: text = page_html.lower() @@ -285,6 +293,8 @@ for i, path in enumerate(files, 1): else: redirects[key] = target + is_league = is_league_title(full_title) + base_no_league = strip_league_prefix(full_title) canonical_key = normalize_reference_key(full_title) all_variants[article_id].append({ "path": path, @@ -296,6 +306,8 @@ for i, path in enumerate(files, 1): "is_category": is_category, "is_listing_only": is_listing_only, "is_error": False, + "is_league": is_league, + "base_no_league": base_no_league, }) except Exception as e: @@ -354,11 +366,15 @@ def variant_score(v): "_" in filename or len(filename) > 40 ) + + league_penalty = v.get("is_league", False) + if v.get("is_error"): - return (True, True, True, True, 0, 9999, "zzz") + return (True, True, True, True, True, 0, 9999, "zzz") return ( v["is_listing_only"], v["redirect"], + league_penalty, not is_short_slug, long_title_penalty, -similarity_score, @@ -379,7 +395,7 @@ for article_id, variants in all_variants.items(): variants_sorted = sorted(variants, key=variant_score) chosen = variants_sorted[0] - canonical_slug = normalize_reference_key(chosen["path"].stem) + canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem)) # categories listing-only if chosen["is_listing_only"]: @@ -421,6 +437,10 @@ for article_id, variants in all_variants.items(): if v is not chosen: filename_key = normalize_title(Path(v["path"]).stem) add_equivalence(filename_key, canonical_slug) + if v.get("is_league"): + league_key = normalize_reference_key(v["canonical_key"]) + base_key = normalize_reference_key(v["base_no_league"]) + add_equivalence(league_key, base_key) print(f"{len(canonical_pages)} pages canoniques") print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")