remove mk3/4 intro box

This commit is contained in:
Maxime Réaux 2026-04-15 09:27:10 +02:00
parent 258886173e
commit 4e473ba2c9

View file

@ -206,6 +206,38 @@ def remove_orphan_headers(tag):
if nxt.get_text(strip=True):
continue
def is_intro_rule_table(table):
    """Return True when *table* looks like an introductory rules box.

    Two conditions must both hold:

    * structure: the element is a ``<table>`` whose ``class`` attribute
      contains ``wikitable`` or whose ``width`` attribute is ``100%``;
    * content: its lower-cased text contains at least one of the known
      intro phrases listed below.
    """
    # Text is extracted first, with a single space between fragments, so the
    # multi-word phrases below can match across nested tags.
    lowered = table.get_text(" ", strip=True).lower()

    if table.name != "table":
        return False

    # A missing class attribute is treated as an empty class list.
    css_classes = table.get("class") or []
    if "wikitable" not in css_classes and table.get("width") != "100%":
        return False

    intro_markers = (
        "unlimited only",
        "prime",
        "legacy army",
        "mark iii",
        "mk4 icon",
        "rest of this page",
    )
    for marker in intro_markers:
        if marker in lowered:
            return True
    return False
def remove_intro_rule_box(content):
    """Strip intro rule tables from the leading children of *content*.

    Walks the direct children of *content* in order. Whitespace-only text
    nodes and ``<div>`` elements are skipped over; each ``<table>`` is
    removed in place when ``is_intro_rule_table`` accepts it. The walk stops
    at the first child of any other kind (a ``<p>``, any other tag, or a
    non-blank text node).
    """
    # Iterate over a snapshot: decompose() mutates content.children.
    for node in list(content.children):
        # Whitespace-only text between elements is ignored.
        if isinstance(node, str) and not node.strip():
            continue

        tag_name = getattr(node, "name", None)
        if tag_name == "table":
            if is_intro_rule_table(node):
                node.decompose()
        elif tag_name != "div":
            # A paragraph, an unexpected tag, or non-blank text ends the
            # leading region that is scanned.
            break
# ======================
# CORE FUNCTIONS
# ======================
@ -229,6 +261,7 @@ def clean_html_file(input_path: Path, output_path: Path):
if not content:
print(f"[WARN] No content in {input_path.name}")
return
remove_intro_rule_box(content)
# Detect category pages
title = soup.title.string if soup.title else ""