remove mk3/4 intro box
This commit is contained in:
parent
258886173e
commit
4e473ba2c9
1 changed file with 33 additions and 0 deletions
|
|
@@ -206,6 +206,38 @@ def remove_orphan_headers(tag):
|
|||
if nxt.get_text(strip=True):
|
||||
continue
|
||||
|
||||
def is_intro_rule_table(table):
    """Return True if *table* looks like an intro rule box.

    An element qualifies when both signals are present:

    * structure: it is a ``<table>`` tag that either carries the
      ``wikitable`` class or has ``width="100%"``;
    * content: its lower-cased text contains at least one of the intro
      keywords listed below.

    Args:
        table: A tag-like object exposing ``name``, ``get(attr)`` and
            ``get_text(separator, strip=...)``.

    Returns:
        bool: Whether both the structure and the content signal match.
    """
    # Cheap structural checks come first so the text of elements that can
    # never match is not extracted at all.
    if getattr(table, "name", None) != "table":
        return False

    classes = table.get("class") or []
    if isinstance(classes, str):
        # A plain-string class attribute would turn ``in`` into a substring
        # test (e.g. "notwikitable" would match); compare whole tokens.
        classes = classes.split()
    if "wikitable" not in classes and table.get("width") != "100%":
        return False

    text = table.get_text(" ", strip=True).lower()
    intro_keywords = (
        "unlimited only",
        "prime",
        "legacy army",
        "mark iii",
        "mk4 icon",
        "rest of this page",
    )
    return any(keyword in text for keyword in intro_keywords)
|
||||
|
||||
def remove_intro_rule_box(content):
    """Delete intro rule tables from the leading children of *content*.

    Walks the direct children of *content* in order. Whitespace-only text
    nodes and ``<div>`` elements are stepped over; every ``<table>`` that
    ``is_intro_rule_table`` accepts is decomposed. The walk ends at the
    first ``<p>``, at any other tag, or at a non-blank text node.

    Args:
        content: Container whose ``children`` are scanned; it is modified
            in place.
    """
    # Copy the child list up front: matching tables are decomposed while
    # the walk is still in progress.
    for node in tuple(content.children):
        if isinstance(node, str) and not node.strip():
            continue  # whitespace-only text between elements

        tag_name = getattr(node, "name", None)
        if tag_name == "table":
            if is_intro_rule_table(node):
                node.decompose()
        elif tag_name != "div":
            # A paragraph, any other tag, or non-blank text ends the
            # leading region in which intro boxes are looked for.
            break
|
||||
|
||||
# ======================
|
||||
# CORE FUNCTIONS
|
||||
# ======================
|
||||
|
|
@@ -229,6 +261,7 @@ def clean_html_file(input_path: Path, output_path: Path):
|
|||
if not content:
|
||||
print(f"[WARN] No content in {input_path.name}")
|
||||
return
|
||||
remove_intro_rule_box(content)
|
||||
|
||||
# Detect category pages
|
||||
title = soup.title.string if soup.title else ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue