force 'league' equivalence
This commit is contained in:
parent
313c92e928
commit
0aace3dfc8
1 changed file with 22 additions and 2 deletions
|
|
@ -53,6 +53,14 @@ ERROR_PAGE_PATTERNS = [
|
||||||
"page not found",
|
"page not found",
|
||||||
"request could not be satisfied",
|
"request could not be satisfied",
|
||||||
]
|
]
|
||||||
|
# Matches a "league model -" prefix anchored at the very start of a title:
# letter case is ignored and any amount of whitespace (including none) may
# surround the hyphen; whitespace following the hyphen is consumed as well.
LEAGUE_PREFIX_RE = re.compile(
    r"^league model\s*-\s*",
    flags=re.IGNORECASE,
)
|
||||||
|
|
||||||
|
def strip_league_prefix(title: str) -> str:
    """Return *title* without its leading league prefix.

    The title is first passed through ``normalize_title`` (defined elsewhere
    in this file); any match of ``LEAGUE_PREFIX_RE`` is then removed and the
    remaining text is stripped of surrounding whitespace.
    """
    normalized = normalize_title(title)
    without_prefix = LEAGUE_PREFIX_RE.sub("", normalized)
    return without_prefix.strip()
|
||||||
|
|
||||||
|
def is_league_title(title: str) -> bool:
    """Tell whether *title* starts with the league prefix.

    The check runs ``LEAGUE_PREFIX_RE`` against the output of
    ``normalize_title`` (defined elsewhere in this file), not the raw title.
    """
    normalized = normalize_title(title)
    # A successful regex match object is always truthy, so comparing against
    # None is equivalent to converting the result with bool().
    return LEAGUE_PREFIX_RE.match(normalized) is not None
|
||||||
|
|
||||||
def is_error_page(page_html: str) -> bool:
|
def is_error_page(page_html: str) -> bool:
|
||||||
text = page_html.lower()
|
text = page_html.lower()
|
||||||
|
|
@ -285,6 +293,8 @@ for i, path in enumerate(files, 1):
|
||||||
else:
|
else:
|
||||||
redirects[key] = target
|
redirects[key] = target
|
||||||
|
|
||||||
|
is_league = is_league_title(full_title)
|
||||||
|
base_no_league = strip_league_prefix(full_title)
|
||||||
canonical_key = normalize_reference_key(full_title)
|
canonical_key = normalize_reference_key(full_title)
|
||||||
all_variants[article_id].append({
|
all_variants[article_id].append({
|
||||||
"path": path,
|
"path": path,
|
||||||
|
|
@ -296,6 +306,8 @@ for i, path in enumerate(files, 1):
|
||||||
"is_category": is_category,
|
"is_category": is_category,
|
||||||
"is_listing_only": is_listing_only,
|
"is_listing_only": is_listing_only,
|
||||||
"is_error": False,
|
"is_error": False,
|
||||||
|
"is_league": is_league,
|
||||||
|
"base_no_league": base_no_league,
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -354,11 +366,15 @@ def variant_score(v):
|
||||||
"_" in filename or
|
"_" in filename or
|
||||||
len(filename) > 40
|
len(filename) > 40
|
||||||
)
|
)
|
||||||
|
|
||||||
|
league_penalty = v.get("is_league", False)
|
||||||
|
|
||||||
if v.get("is_error"):
|
if v.get("is_error"):
|
||||||
return (True, True, True, True, 0, 9999, "zzz")
|
return (True, True, True, True, True, 0, 9999, "zzz")
|
||||||
return (
|
return (
|
||||||
v["is_listing_only"],
|
v["is_listing_only"],
|
||||||
v["redirect"],
|
v["redirect"],
|
||||||
|
league_penalty,
|
||||||
not is_short_slug,
|
not is_short_slug,
|
||||||
long_title_penalty,
|
long_title_penalty,
|
||||||
-similarity_score,
|
-similarity_score,
|
||||||
|
|
@ -379,7 +395,7 @@ for article_id, variants in all_variants.items():
|
||||||
variants_sorted = sorted(variants, key=variant_score)
|
variants_sorted = sorted(variants, key=variant_score)
|
||||||
chosen = variants_sorted[0]
|
chosen = variants_sorted[0]
|
||||||
|
|
||||||
canonical_slug = normalize_reference_key(chosen["path"].stem)
|
canonical_slug = normalize_reference_key(strip_league_prefix(chosen["path"].stem))
|
||||||
|
|
||||||
# categories listing-only
|
# categories listing-only
|
||||||
if chosen["is_listing_only"]:
|
if chosen["is_listing_only"]:
|
||||||
|
|
@ -421,6 +437,10 @@ for article_id, variants in all_variants.items():
|
||||||
if v is not chosen:
|
if v is not chosen:
|
||||||
filename_key = normalize_title(Path(v["path"]).stem)
|
filename_key = normalize_title(Path(v["path"]).stem)
|
||||||
add_equivalence(filename_key, canonical_slug)
|
add_equivalence(filename_key, canonical_slug)
|
||||||
|
if v.get("is_league"):
|
||||||
|
league_key = normalize_reference_key(v["canonical_key"])
|
||||||
|
base_key = normalize_reference_key(v["base_no_league"])
|
||||||
|
add_equivalence(league_key, base_key)
|
||||||
|
|
||||||
print(f"{len(canonical_pages)} pages canoniques")
|
print(f"{len(canonical_pages)} pages canoniques")
|
||||||
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue