remove mk3/4 intro box
This commit is contained in:
parent
258886173e
commit
4e473ba2c9
1 changed file with 33 additions and 0 deletions
|
|
@ -206,6 +206,38 @@ def remove_orphan_headers(tag):
|
||||||
if nxt.get_text(strip=True):
|
if nxt.get_text(strip=True):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
def is_intro_rule_table(table):
    """Return True when *table* looks like the intro rule box.

    Two signals must both be present:

    * structure -- the element is a ``<table>`` that either carries the
      ``wikitable`` class or has ``width="100%"``;
    * content -- its lower-cased text contains at least one of the intro
      keywords listed in the body.

    Args:
        table: A tag-like object exposing ``name``, ``get(attr)`` and
            ``get_text(separator, strip=...)`` (presumably a BeautifulSoup
            ``Tag`` -- confirm against the caller).

    Returns:
        bool: ``True`` only if both the structure and the content signal
        match, ``False`` otherwise.
    """
    if table.name != "table":
        return False

    # ``class`` may be a list of tokens or a single raw string (this
    # appears to depend on parser configuration).  Compare whole tokens in
    # both cases so that e.g. "notwikitable" is not treated as a match.
    classes = table.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()

    if "wikitable" not in classes and table.get("width") != "100%":
        return False

    # Text extraction walks the whole subtree, so it is done only after the
    # cheap structural checks have passed.
    text = table.get_text(" ", strip=True).lower()
    intro_keywords = (
        "unlimited only",
        "prime",
        "legacy army",
        "mark iii",
        "mk4 icon",
        "rest of this page",
    )
    return any(keyword in text for keyword in intro_keywords)
|
||||||
|
|
||||||
|
def remove_intro_rule_box(content):
    """Strip intro rule tables from the leading children of *content*.

    Direct children are visited in document order.  Whitespace-only text
    nodes are ignored, ``<div>`` children and non-matching tables are passed
    over, and every ``<table>`` accepted by ``is_intro_rule_table`` is
    removed in place with ``decompose()``.  The scan ends at the first child
    that is neither a ``<table>`` nor a ``<div>`` (a ``<p>``, any other tag,
    or non-blank text).

    Args:
        content: Container whose ``children`` are scanned; it is mutated in
            place (presumably a BeautifulSoup ``Tag`` -- confirm against the
            caller).

    Returns:
        None.
    """
    # Iterate over a snapshot because matching tables are removed mid-loop.
    for node in list(content.children):
        # Blank text between tags carries no meaning here.
        if isinstance(node, str) and not node.strip():
            continue

        tag_name = getattr(node, "name", None)

        # Only tables and divs may precede the real content; anything else
        # (including the first paragraph) ends the search.
        if tag_name not in ("table", "div"):
            break

        if tag_name == "table" and is_intro_rule_table(node):
            node.decompose()
|
||||||
|
|
||||||
# ======================
|
# ======================
|
||||||
# CORE FUNCTIONS
|
# CORE FUNCTIONS
|
||||||
# ======================
|
# ======================
|
||||||
|
|
@ -229,6 +261,7 @@ def clean_html_file(input_path: Path, output_path: Path):
|
||||||
if not content:
|
if not content:
|
||||||
print(f"[WARN] No content in {input_path.name}")
|
print(f"[WARN] No content in {input_path.name}")
|
||||||
return
|
return
|
||||||
|
remove_intro_rule_box(content)
|
||||||
|
|
||||||
# Detect category pages
|
# Detect category pages
|
||||||
title = soup.title.string if soup.title else ""
|
title = soup.title.string if soup.title else ""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue