force 'league' equivalence
This commit is contained in:
parent
313c92e928
commit
0aace3dfc8
1 changed files with 22 additions and 2 deletions
|
|
@ -53,6 +53,14 @@ ERROR_PAGE_PATTERNS = [
|
|||
"page not found",
|
||||
"request could not be satisfied",
|
||||
]
|
||||
# Matches a leading "league model" label followed by a hyphen, with optional
# whitespace on either side of the hyphen. Case-insensitive; anchored at the
# start of the string only.
LEAGUE_PREFIX_RE = re.compile(r"^league model\s*-\s*", re.I)
|
||||
|
||||
def strip_league_prefix(title: str) -> str:
    """Return *title* without its leading "league model -" prefix.

    The title is first passed through ``normalize_title`` (defined elsewhere
    in this file); the prefix is then removed if present, and surrounding
    whitespace is stripped from the result either way.
    """
    normalized = normalize_title(title)
    prefix = LEAGUE_PREFIX_RE.match(normalized)
    # The pattern is anchored at the start, so at most one prefix can match.
    remainder = normalized[prefix.end():] if prefix else normalized
    return remainder.strip()
|
||||
|
||||
def is_league_title(title: str) -> bool:
    """Tell whether *title* starts with the "league model -" prefix.

    The check is made on the output of ``normalize_title`` (defined elsewhere
    in this file), not on the raw title.
    """
    normalized = normalize_title(title)
    return LEAGUE_PREFIX_RE.match(normalized) is not None
|
||||
|
||||
def is_error_page(page_html: str) -> bool:
|
||||
text = page_html.lower()
|
||||
|
|
@ -285,6 +293,8 @@ for i, path in enumerate(files, 1):
|
|||
else:
|
||||
redirects[key] = target
|
||||
|
||||
is_league = is_league_title(full_title)
|
||||
base_no_league = strip_league_prefix(full_title)
|
||||
canonical_key = normalize_reference_key(full_title)
|
||||
all_variants[article_id].append({
|
||||
"path": path,
|
||||
|
|
@ -296,6 +306,8 @@ for i, path in enumerate(files, 1):
|
|||
"is_category": is_category,
|
||||
"is_listing_only": is_listing_only,
|
||||
"is_error": False,
|
||||
"is_league": is_league,
|
||||
"base_no_league": base_no_league,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -354,11 +366,15 @@ def variant_score(v):
|
|||
"_" in filename or
|
||||
len(filename) > 40
|
||||
)
|
||||
|
||||
league_penalty = v.get("is_league", False)
|
||||
|
||||
if v.get("is_error"):
|
||||
return (True, True, True, True, 0, 9999, "zzz")
|
||||
return (True, True, True, True, True, 0, 9999, "zzz")
|
||||
return (
|
||||
v["is_listing_only"],
|
||||
v["redirect"],
|
||||
league_penalty,
|
||||
not is_short_slug,
|
||||
long_title_penalty,
|
||||
-similarity_score,
|
||||
|
|
@ -379,7 +395,7 @@ for article_id, variants in all_variants.items():
|
|||
variants_sorted = sorted(variants, key=variant_score)
|
||||
chosen = variants_sorted[0]
|
||||
|
||||
canonical_slug = normalize_reference_key(chosen["path"].stem)
|
||||
canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem))
|
||||
|
||||
# categories listing-only
|
||||
if chosen["is_listing_only"]:
|
||||
|
|
@ -421,6 +437,10 @@ for article_id, variants in all_variants.items():
|
|||
if v is not chosen:
|
||||
filename_key = normalize_title(Path(v["path"]).stem)
|
||||
add_equivalence(filename_key, canonical_slug)
|
||||
if v.get("is_league"):
|
||||
league_key = normalize_reference_key(v["canonical_key"])
|
||||
base_key = normalize_reference_key(v["base_no_league"])
|
||||
add_equivalence(league_key, base_key)
|
||||
|
||||
print(f"{len(canonical_pages)} pages canoniques")
|
||||
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue