capture potential_tags
This commit is contained in:
parent
7f019ed98c
commit
556d6f1e03
1 changed file with 102 additions and 54 deletions
|
|
@ -6,6 +6,7 @@ import html
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
SOURCE_DIR = Path("../original_index")
|
SOURCE_DIR = Path("../original_index")
|
||||||
OUTPUT_DIR = Path("../output")
|
OUTPUT_DIR = Path("../output")
|
||||||
|
|
@ -50,18 +51,18 @@ def extract_wg_page_name(page_html: str) -> str | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_page_identity(html: str):
|
def extract_page_identity(page_html: str):
|
||||||
page = extract_wg_page_name(html)
|
page = extract_wg_page_name(page_html)
|
||||||
if page:
|
if page:
|
||||||
return page
|
return page
|
||||||
m = re.search(r"<title>(.*?) -", html, re.I)
|
m = re.search(r"<title>(.*?) -", page_html, re.I)
|
||||||
if m:
|
if m:
|
||||||
return html.unescape(m.group(1))
|
return html.unescape(m.group(1))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_article_id(html: str) -> int | None:
|
def extract_article_id(page_html: str) -> int | None:
|
||||||
m = ARTICLE_ID_RE.search(html)
|
m = ARTICLE_ID_RE.search(page_html)
|
||||||
if m:
|
if m:
|
||||||
aid = int(m.group(1))
|
aid = int(m.group(1))
|
||||||
if aid > 0:
|
if aid > 0:
|
||||||
|
|
@ -76,8 +77,8 @@ def extract_internal_redirect(page_html: str):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def extract_namespace(html: str) -> str:
|
def extract_namespace(page_html: str) -> str:
|
||||||
m = NAMESPACE_RE.search(html)
|
m = NAMESPACE_RE.search(page_html)
|
||||||
if m:
|
if m:
|
||||||
return m.group(1)
|
return m.group(1)
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -102,6 +103,32 @@ def normalize_reference_key(key: str) -> str:
|
||||||
return key.strip()
|
return key.strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def has_editorial_content(html_page: str, *, min_length: int = 200) -> bool:
    """Tell whether a page carries hand-written text besides a generated listing.

    Args:
        html_page: Raw HTML of the page.
        min_length: The text found before the generated listing must be
            strictly longer than this many characters to count as
            editorial content.

    Returns:
        ``False`` when the page has no ``#mw-content-text`` element.
        ``True`` when that element holds no ``.mw-category-generated``
        node (the page is not an auto-generated category listing).
        Otherwise, whether the text of the direct children that precede
        the listing is longer than ``min_length``.
    """
    soup = BeautifulSoup(html_page, "html.parser")

    content = soup.find(id="mw-content-text")
    if content is None:
        return False

    if content.select_one(".mw-category-generated") is None:
        return True  # not an auto-generated category page

    # Collect the text that comes BEFORE the generated listing.
    # NOTE(review): only a *direct* child carrying the class stops the scan;
    # if the listing is nested deeper, its own text is counted as editorial.
    fragments = []
    for child in content.children:
        # Plain text nodes expose no usable ``get``; only tags can carry
        # the listing class.
        attr_getter = getattr(child, "get", None)
        if attr_getter and "mw-category-generated" in attr_getter("class", []):
            break
        fragments.append(child.get_text(" ", strip=True))

    # Fragments are joined without a separator so the measured length is
    # the same as with incremental concatenation.
    editorial_text = "".join(fragments).strip()

    return len(editorial_text) > min_length
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# Registry structures
|
# Registry structures
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -146,6 +173,8 @@ for i, path in enumerate(files, 1):
|
||||||
base_title = norm
|
base_title = norm
|
||||||
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
is_redirect = bool(IS_REDIRECT_RE.search(page_html))
|
||||||
is_category = ns == "Category" or norm.startswith("category:")
|
is_category = ns == "Category" or norm.startswith("category:")
|
||||||
|
has_content = has_editorial_content(page_html)
|
||||||
|
is_listing_only = is_category and not has_content
|
||||||
wg_title = extract_wg_title(page_html)
|
wg_title = extract_wg_title(page_html)
|
||||||
|
|
||||||
# Categories
|
# Categories
|
||||||
|
|
@ -178,11 +207,12 @@ for i, path in enumerate(files, 1):
|
||||||
all_variants[article_id].append({
|
all_variants[article_id].append({
|
||||||
"path": path,
|
"path": path,
|
||||||
"title": base_title,
|
"title": base_title,
|
||||||
"canonical_key": full_title,
|
"canonical_key": canonical_key,
|
||||||
"article_id": article_id,
|
"article_id": article_id,
|
||||||
"wg_title": normalize_title(wg_title) if wg_title else None,
|
"wg_title": normalize_title(wg_title) if wg_title else None,
|
||||||
"redirect": is_redirect,
|
"redirect": is_redirect,
|
||||||
"is_category": is_category,
|
"is_category": is_category,
|
||||||
|
"is_listing_only": is_listing_only,
|
||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -197,9 +227,10 @@ print("Variants collected:", len(all_variants))
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
canonical_pages = {}
|
canonical_pages = {}
|
||||||
|
potential_tags = defaultdict(list)
|
||||||
equivalences = {}
|
equivalences = {}
|
||||||
category_replaced = 0
|
category_renamed = 0
|
||||||
nb_all_cat = 0
|
category_not_chosen = 0
|
||||||
|
|
||||||
|
|
||||||
def slug_to_title(filename: str) -> str:
|
def slug_to_title(filename: str) -> str:
|
||||||
|
|
@ -241,7 +272,7 @@ def variant_score(v):
|
||||||
)
|
)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
v["is_category"],
|
v["is_listing_only"],
|
||||||
v["redirect"],
|
v["redirect"],
|
||||||
not is_short_slug,
|
not is_short_slug,
|
||||||
long_title_penalty,
|
long_title_penalty,
|
||||||
|
|
@ -250,35 +281,6 @@ def variant_score(v):
|
||||||
filename.lower(),
|
filename.lower(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
for article_id, variants in all_variants.items():
|
|
||||||
|
|
||||||
# tri déterministe
|
|
||||||
variants_sorted = sorted(variants, key=variant_score)
|
|
||||||
print(f"variants_sorted: {variants_sorted}")
|
|
||||||
|
|
||||||
chosen = variants_sorted[0]
|
|
||||||
|
|
||||||
if all(v["is_category"] for v in variants):
|
|
||||||
nb_all_cat += 1
|
|
||||||
|
|
||||||
if chosen["is_category"]:
|
|
||||||
category_replaced += 1
|
|
||||||
|
|
||||||
canonical_title = normalize_reference_key(chosen["title"])
|
|
||||||
|
|
||||||
canonical_pages[article_id] = {
|
|
||||||
"path": chosen["path"],
|
|
||||||
"title": canonical_title,
|
|
||||||
"redirect": chosen["redirect"],
|
|
||||||
}
|
|
||||||
|
|
||||||
# équivalences
|
|
||||||
for v in variants:
|
|
||||||
equivalences[v["canonical_key"]] = chosen["title"]
|
|
||||||
|
|
||||||
equivalences.clear()
|
|
||||||
|
|
||||||
def add_equivalence(k, v):
|
def add_equivalence(k, v):
|
||||||
k = normalize_reference_key(k)
|
k = normalize_reference_key(k)
|
||||||
v = normalize_reference_key(v)
|
v = normalize_reference_key(v)
|
||||||
|
|
@ -286,16 +288,48 @@ def add_equivalence(k, v):
|
||||||
if k != v:
|
if k != v:
|
||||||
equivalences[k] = v
|
equivalences[k] = v
|
||||||
|
|
||||||
|
|
||||||
for article_id, variants in all_variants.items():
|
for article_id, variants in all_variants.items():
|
||||||
canonical_title = canonical_pages[article_id]["title"]
|
variants_sorted = sorted(variants, key=variant_score)
|
||||||
canonical_slug = Path(canonical_pages[article_id]["path"]).stem
|
chosen = variants_sorted[0]
|
||||||
|
|
||||||
|
canonical_slug = normalize_reference_key(chosen["path"].stem)
|
||||||
|
|
||||||
|
# categories listing-only
|
||||||
|
if chosen["is_listing_only"]:
|
||||||
|
tag_name = normalize_reference_key(chosen["title"])
|
||||||
for v in variants:
|
for v in variants:
|
||||||
add_equivalence(v["canonical_key"], canonical_slug)
|
potential_tags[tag_name].append(normalize_title(v["path"].stem))
|
||||||
|
if v["wg_title"]:
|
||||||
|
potential_tags[tag_name].append(normalize_reference_key(v["wg_title"]))
|
||||||
|
continue
|
||||||
|
|
||||||
|
canonical_pages[article_id] = {
|
||||||
|
"path": chosen["path"],
|
||||||
|
"title": canonical_slug,
|
||||||
|
"redirect": chosen["redirect"],
|
||||||
|
}
|
||||||
|
|
||||||
|
if chosen["wg_title"]:
|
||||||
|
add_equivalence(chosen["wg_title"], canonical_slug)
|
||||||
|
|
||||||
|
for v in variants:
|
||||||
|
if v["is_category"] and not v["is_listing_only"]:
|
||||||
|
# catégorie non choisie
|
||||||
|
if v is not chosen:
|
||||||
|
category_not_chosen += 1
|
||||||
|
# catégorie choisie mais qui est une category_* → renommée
|
||||||
|
elif chosen["path"].stem.lower().startswith("category"):
|
||||||
|
category_renamed += 1
|
||||||
|
|
||||||
|
if v is not chosen:
|
||||||
filename_key = normalize_title(Path(v["path"]).stem)
|
filename_key = normalize_title(Path(v["path"]).stem)
|
||||||
add_equivalence(filename_key, canonical_slug)
|
add_equivalence(filename_key, canonical_slug)
|
||||||
|
|
||||||
print(f"Nombre de cas avec toutes variantes sont des categories : {nb_all_cat}")
|
print(f"{len(canonical_pages)} pages canoniques")
|
||||||
print(f"{category_replaced} 'category_*' remplacées par leur version de base")
|
print(f"{category_not_chosen} pages homonymes 'category_*' non retenues")
|
||||||
|
print(f"{category_renamed} pages prefix 'category_*' renommées")
|
||||||
|
print(f"{len(potential_tags)} potential_tags enregistrés")
|
||||||
|
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 3 — resolve redirects
|
# PASS 3 — resolve redirects
|
||||||
|
|
@ -320,6 +354,16 @@ redirects.clear()
|
||||||
# PASS 4 — normalisation finale des equivalences
|
# PASS 4 — normalisation finale des equivalences
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
||||||
|
def resolve_equivalence(key, mapping=None):
    """Follow an equivalence chain from *key* and return where it ends.

    Args:
        key: Starting key.
        mapping: Dict mapping a key to its replacement. When omitted, the
            module-level ``equivalences`` dict is used.

    Returns:
        The first key reached that is not itself a key of the mapping.
        If the chain loops back on a key that was already visited, that
        key is returned instead, so the walk always terminates.
    """
    if mapping is None:
        mapping = equivalences

    visited = set()
    # Stop either at the end of the chain or as soon as a cycle is detected.
    while key in mapping and key not in visited:
        visited.add(key)
        key = mapping[key]
    return key
|
||||||
|
|
||||||
|
for k in list(equivalences):
|
||||||
|
equivalences[k] = resolve_equivalence(equivalences[k])
|
||||||
|
|
||||||
valid_titles = {
|
valid_titles = {
|
||||||
data["title"]
|
data["title"]
|
||||||
for data in canonical_pages.values()
|
for data in canonical_pages.values()
|
||||||
|
|
@ -338,10 +382,14 @@ for k, v in list(equivalences.items()):
|
||||||
for k, v in equivalences.items():
|
for k, v in equivalences.items():
|
||||||
if v not in valid_titles:
|
if v not in valid_titles:
|
||||||
problems.append(f"Non canonical mapping: {k} -> {v}")
|
problems.append(f"Non canonical mapping: {k} -> {v}")
|
||||||
|
|
||||||
equivalences = {
|
equivalences = {
|
||||||
k: v for k, v in equivalences.items()
|
k: v for k, v in equivalences.items()
|
||||||
if k != v
|
if k != v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for k in list(equivalences):
|
||||||
|
equivalences[k] = resolve_equivalence(equivalences[k])
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
# PASS 5 — copie des pages canoniques
|
# PASS 5 — copie des pages canoniques
|
||||||
# --------------------------------------------------
|
# --------------------------------------------------
|
||||||
|
|
@ -356,16 +404,15 @@ def title_to_filename(title: str) -> str:
|
||||||
copied = 0
|
copied = 0
|
||||||
total = len(canonical_pages)
|
total = len(canonical_pages)
|
||||||
|
|
||||||
for i, (key, data) in enumerate(canonical_pages.items(), 1):
|
for i, (article_id, data) in enumerate(canonical_pages.items(), 1):
|
||||||
|
|
||||||
src = data["path"]
|
src = data["path"]
|
||||||
|
dst_name = title_to_filename(data["title"])
|
||||||
dst_name = sanitize_filename(src.name.casefold())
|
|
||||||
dst = PAGES_DIR / dst_name
|
dst = PAGES_DIR / dst_name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
shutil.copy2(src, dst)
|
shutil.copy2(src, dst)
|
||||||
canonical_pages[key] = dst_name
|
canonical_pages[article_id] = dst_name
|
||||||
copied += 1
|
copied += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
problems.append(f"Copy failed {src}: {e}")
|
problems.append(f"Copy failed {src}: {e}")
|
||||||
|
|
@ -383,6 +430,7 @@ registry = {
|
||||||
"canonical_pages": canonical_pages,
|
"canonical_pages": canonical_pages,
|
||||||
"equivalences": equivalences,
|
"equivalences": equivalences,
|
||||||
"redirects": redirects,
|
"redirects": redirects,
|
||||||
|
"potential_tags": potential_tags,
|
||||||
"ignored_pages": ignored_pages,
|
"ignored_pages": ignored_pages,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -401,7 +449,7 @@ with open(REPORT_PATH, "w", encoding="utf-8") as f:
|
||||||
f.write(f"Redirects: {len(redirects)}\n")
|
f.write(f"Redirects: {len(redirects)}\n")
|
||||||
f.write(f"Ignored: {len(ignored_pages)}\n")
|
f.write(f"Ignored: {len(ignored_pages)}\n")
|
||||||
f.write(f"Problems: {len(problems)}\n\n")
|
f.write(f"Problems: {len(problems)}\n\n")
|
||||||
for p in problems[:200]:
|
for p in problems:
|
||||||
f.write(p + "\n")
|
f.write(p + "\n")
|
||||||
|
|
||||||
print("\n✅ PREPARATION COMPLETE")
|
print("\n✅ PREPARATION COMPLETE")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue