force 'league' equivalence

This commit is contained in:
Maxime Réaux 2026-04-10 14:39:10 +02:00
parent 313c92e928
commit 0aace3dfc8

View file

@ -53,6 +53,14 @@ ERROR_PAGE_PATTERNS = [
"page not found",
"request could not be satisfied",
]
# Matches a leading "league model" marker followed by a hyphen, with optional
# whitespace on either side of the hyphen; the match is case-insensitive and
# anchored at the start of the string.
LEAGUE_PREFIX_RE = re.compile(r"^league model\s*-\s*", re.I)
def strip_league_prefix(title: str) -> str:
    """Return the normalized form of *title* without its league prefix.

    The title is first passed through ``normalize_title``; any leading text
    matched by ``LEAGUE_PREFIX_RE`` is then removed and surrounding
    whitespace is trimmed. Titles that do not carry the prefix come back
    normalized and trimmed, otherwise unchanged.
    """
    normalized = normalize_title(title)
    without_prefix = LEAGUE_PREFIX_RE.sub("", normalized)
    return without_prefix.strip()
def is_league_title(title: str) -> bool:
    """Tell whether *title*, once normalized, starts with the league prefix.

    The check is performed on the output of ``normalize_title`` using
    ``LEAGUE_PREFIX_RE``, which only matches at the start of the string.
    """
    normalized = normalize_title(title)
    # A regex match object is always truthy, so "matched" means "not None".
    return LEAGUE_PREFIX_RE.match(normalized) is not None
def is_error_page(page_html: str) -> bool:
text = page_html.lower()
@ -285,6 +293,8 @@ for i, path in enumerate(files, 1):
else:
redirects[key] = target
is_league = is_league_title(full_title)
base_no_league = strip_league_prefix(full_title)
canonical_key = normalize_reference_key(full_title)
all_variants[article_id].append({
"path": path,
@ -296,6 +306,8 @@ for i, path in enumerate(files, 1):
"is_category": is_category,
"is_listing_only": is_listing_only,
"is_error": False,
"is_league": is_league,
"base_no_league": base_no_league,
})
except Exception as e:
@ -354,11 +366,15 @@ def variant_score(v):
"_" in filename or
len(filename) > 40
)
league_penalty = v.get("is_league", False)
if v.get("is_error"):
return (True, True, True, True, 0, 9999, "zzz")
return (True, True, True, True, True, 0, 9999, "zzz")
return (
v["is_listing_only"],
v["redirect"],
league_penalty,
not is_short_slug,
long_title_penalty,
-similarity_score,
@ -379,7 +395,7 @@ for article_id, variants in all_variants.items():
variants_sorted = sorted(variants, key=variant_score)
chosen = variants_sorted[0]
canonical_slug = normalize_reference_key(chosen["path"].stem)
canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem))
# categories listing-only
if chosen["is_listing_only"]:
@ -421,6 +437,10 @@ for article_id, variants in all_variants.items():
if v is not chosen:
filename_key = normalize_title(Path(v["path"]).stem)
add_equivalence(filename_key, canonical_slug)
if v.get("is_league"):
league_key = normalize_reference_key(v["canonical_key"])
base_key = normalize_reference_key(v["base_no_league"])
add_equivalence(league_key, base_key)
print(f"{len(canonical_pages)} pages canoniques")
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")