From 4e473ba2c94084879d999bde51725d442a9c0120 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?= <maxime.reaux@advans-group.com>
Date: Wed, 15 Apr 2026 09:27:10 +0200
Subject: [PATCH] remove mk3/4 intro box

---
 extract_content.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/extract_content.py b/extract_content.py
index 93da286..18ceabc 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -206,6 +206,38 @@ def remove_orphan_headers(tag):
         if nxt.get_text(strip=True):
             continue
 
+def is_intro_rule_table(table):
+    """Return True when *table* looks like the mk3/4 intro rule box."""
+    haystack = table.get_text(" ", strip=True).lower()
+    if table.name != "table":
+        return False
+    css_classes = table.get("class") or []
+    if "wikitable" not in css_classes and table.get("width") != "100%":
+        return False
+    # Structure matched; now require at least one intro-box marker phrase.
+    markers = (
+        "unlimited only",
+        "prime",
+        "legacy army",
+        "mark iii",
+        "mk4 icon",
+        "rest of this page",
+    )
+    return any(marker in haystack for marker in markers)
+
+def remove_intro_rule_box(content):
+    """Strip intro rule tables from the leading children of *content*."""
+    for node in list(content.children):  # snapshot: nodes get removed mid-loop
+        if isinstance(node, str) and not node.strip():
+            continue  # whitespace-only text between elements
+        tag_name = getattr(node, "name", None)
+        # The scan only passes over tables and divs; a <p> or any other
+        # node (including non-blank text) ends it.
+        if tag_name not in ("table", "div"):
+            break
+        if tag_name == "table" and is_intro_rule_table(node):
+            node.decompose()
+
 # ======================
 # CORE FUNCTIONS
 # ======================
@@ -229,6 +261,7 @@ def clean_html_file(input_path: Path, output_path: Path):
     if not content:
         print(f"[WARN] No content in {input_path.name}")
         return
+    remove_intro_rule_box(content)
 
     # Detect category pages
     title = soup.title.string if soup.title else ""