From 6eeabd7c9db08be1e3186db2a0b02c7b0123651c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?= <maxime.reaux@advans-group.com>
Date: Wed, 15 Apr 2026 08:08:00 +0200
Subject: [PATCH] Clean up empty parentheses & brackets

---
 extract_content.py | 110 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 102 insertions(+), 8 deletions(-)

diff --git a/extract_content.py b/extract_content.py
index 3c6e8f0..58c18ff 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -1,4 +1,5 @@
 import os
+import re
 from pathlib import Path
 from bs4 import BeautifulSoup, Comment
 
@@ -87,6 +88,103 @@ def is_in_rules_clarifications(el):
     title = prev.get_text(strip=True).lower()
     return "rules clarification" in title
 
+def remove_empty_bracket_groups(tag):
+    """Remove "( )" and "[ ]" groups that are split across sibling nodes.
+
+    Every element under ``tag`` is scanned and edited in place. "(" may be
+    followed by at most one blank text node before ")"; "[" by any run of
+    blank text nodes or empty <span> elements before "]". All are extracted.
+    """
+    def is_empty_text(n):
+        return isinstance(n, str) and not n.strip()
+
+    def is_char(n, chars):
+        return isinstance(n, str) and n.strip() in chars
+
+    def is_empty_span(n):
+        return getattr(n, "name", None) == "span" and not n.get_text(strip=True)
+
+    for el in list(tag.find_all(True)):
+        children = list(el.children)
+        i = 0
+        while i < len(children) - 1:
+            curr = children[i]
+            close_idx = None  # index of the closing bracket, once found
+            if is_char(curr, {"("}):
+                # "(" then ")" directly, or with one blank text node between.
+                if is_char(children[i + 1], {")"}):
+                    close_idx = i + 1
+                elif (
+                    is_empty_text(children[i + 1])
+                    and i + 2 < len(children)
+                    and is_char(children[i + 2], {")"})
+                ):
+                    close_idx = i + 2
+            elif is_char(curr, {"["}):
+                # "[" then any run of blank text / empty <span>, then "]".
+                j = i + 1
+                while j < len(children) and (
+                    is_empty_text(children[j]) or is_empty_span(children[j])
+                ):
+                    j += 1
+                if j < len(children) and is_char(children[j], {"]"}):
+                    close_idx = j
+            if close_idx is None:
+                i += 1
+                continue
+            # Extract opener, filler and closer together so that no stray
+            # ")" or "]" is left; then re-read children and re-test index i.
+            for k in range(i, close_idx + 1):
+                children[k].extract()
+            children = list(el.children)
+
+
+def remove_inline_empty_brackets(tag):
+    """Strip "()" / "[]" pairs (optionally holding whitespace) from text nodes."""
+    for text_node in list(tag.find_all(string=True)):
+        original = str(text_node)
+        # Parentheses are removed first, so "[( )]" collapses completely.
+        cleaned = re.sub(r"\[\s*\]", "", re.sub(r"\(\s*\)", "", original))
+        if cleaned == original:
+            continue
+        text_node.replace_with(cleaned)
+
+def remove_split_empty_parentheses(tag):
+    """Drop "(" ... ")" pairs that enclose only blank text or empty <span>s.
+
+    The "(" must end a text node (trailing whitespace allowed); the ")" must
+    be a sibling text node holding only ")". ``tag`` is edited in place.
+    """
+    def is_filler(item):
+        # Whitespace-only string, or a <span> with no visible text.
+        if isinstance(item, str):
+            return not item.strip()
+        return getattr(item, "name", None) == "span" and not item.get_text(strip=True)
+
+    for parent in tag.find_all(True):
+        siblings = list(parent.children)
+        pos = 0
+        while pos < len(siblings):
+            opener = siblings[pos]
+            cut = opener.rfind("(") if isinstance(opener, str) else -1
+            if cut < 0 or opener[cut:].strip() != "(":
+                pos += 1
+                continue
+            # Skip over the filler that follows the opening parenthesis.
+            stop = pos + 1
+            while stop < len(siblings) and is_filler(siblings[stop]):
+                stop += 1
+            closer = siblings[stop] if stop < len(siblings) else None
+            if not (isinstance(closer, str) and closer.strip() == ")"):
+                pos += 1
+                continue
+            # Keep the text before "(", then remove the filler and the ")".
+            opener.replace_with(opener[:cut])
+            for doomed in siblings[pos + 1:stop + 1]:
+                doomed.extract()
+            # Re-read the children and re-test the same position.
+            siblings = list(parent.children)
+
 # ======================
 # CORE FUNCTIONS
 # ======================
@@ -165,12 +263,6 @@ def clean_html_file(input_path: Path, output_path: Path):
                 continue
         a.decompose()
 
-    for el in content.find_all(string=True):
-        t = el.strip()
-        if t in {"(", ")"}:
-            if len(el.parent.get_text(strip=True)) <= 2:
-                el.extract()
-
     # Remove MediaWiki show/hide links
     for el in content.select(
         ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
@@ -182,8 +274,6 @@ def clean_html_file(input_path: Path, output_path: Path):
             "show/hide",
             "click expand",
             "expand to read",
-            "( )",
-            "[ ]"
         ]):
             el.extract()
 
@@ -206,6 +296,10 @@ def clean_html_file(input_path: Path, output_path: Path):
         src = src.replace("../", "")
         img["src"] = src
 
+    remove_empty_bracket_groups(content)
+    remove_split_empty_parentheses(content)
+    remove_inline_empty_brackets(content)
+
     # Output cleaned HTML
     cleaned_html = content.prettify()
     output_path.write_text(cleaned_html, encoding="utf-8")