# Extracted from patch 4e473ba2c94084879d999bde51725d442a9c0120
# ("remove mk3/4 intro box", Maxime Réaux, Wed, 15 Apr 2026 09:27:10 +0200).
# The patch adds the two helpers below to extract_content.py, right after
# remove_orphan_headers(), and calls remove_intro_rule_box(content) from
# clean_html_file() immediately after the "[WARN] No content" early return.

# Lower-case phrases that mark a table as an intro rule box. They are matched
# as plain substrings of the table's lower-cased text.
_INTRO_RULE_KEYWORDS = (
    "unlimited only",
    "prime",
    "legacy army",
    "mark iii",
    "mk4 icon",
    "rest of this page",
)

# Leading tags the scan is allowed to walk over. Any other node (including a
# <p> or non-blank text) marks the start of the real content and ends the scan.
_INTRO_SCAN_TAGS = ("table", "div")


def is_intro_rule_table(table):
    """Return True if *table* looks like an intro rule box.

    A node qualifies when both signals are present:

    * structure: it is a ``<table>`` that either carries the ``wikitable``
      class or has ``width="100%"``;
    * content: its lower-cased text contains at least one of the phrases in
      ``_INTRO_RULE_KEYWORDS``.

    Args:
        table: A tag-like object exposing ``name``, ``get(attr)`` and
            ``get_text(separator, strip=...)``.

    Returns:
        bool: True when the node should be treated as an intro rule box.
    """
    # Cheap structural checks first, so text is only extracted for candidates.
    if table.name != "table":
        return False

    classes = table.get("class") or []
    if isinstance(classes, str):
        # NOTE(review): "class" is normally a list of tokens, but some parser
        # configurations appear to yield a single string. Split it so that
        # e.g. "notwikitable" does not match by substring.
        classes = classes.split()

    if "wikitable" not in classes and table.get("width") != "100%":
        return False

    text = table.get_text(" ", strip=True).lower()
    return any(keyword in text for keyword in _INTRO_RULE_KEYWORDS)


def remove_intro_rule_box(content):
    """Remove intro rule tables from the leading nodes of *content*.

    Direct children are visited in order. Whitespace-only strings are
    skipped; ``<table>`` and ``<div>`` nodes are walked over, and every
    table accepted by ``is_intro_rule_table`` is decomposed in place. The
    scan stops at the first node of any other kind (a ``<p>``, a heading,
    non-blank text, ...), so tables further down are left untouched.

    Args:
        content: A tag-like container exposing ``children``.

    Returns:
        None. *content* is modified in place.
    """
    # Snapshot the children: decompose() mutates the tree while we iterate.
    for el in list(content.children):
        if isinstance(el, str) and not el.strip():
            continue

        name = getattr(el, "name", None)
        if name not in _INTRO_SCAN_TAGS:
            break

        if name == "table" and is_intro_rule_table(el):
            el.decompose()