Clean up parentheses & brackets

2026-04-15 08:08:00 +02:00 · 2026-04-15 08:08:00 +02:00 · 6eeabd7c9d
commit 6eeabd7c9d
parent c9fb3513ee
1 changed files with 102 additions and 8 deletions
--- a/extract_content.py
+++ b/extract_content.py
@ -1,4 +1,5 @@
 import os
 import re
 from pathlib import Path
 from bs4 import BeautifulSoup, Comment
@ -87,6 +88,103 @@ def is_in_rules_clarifications(el):
    title = prev.get_text(strip=True).lower()
    return "rules clarification" in title
 def remove_empty_bracket_groups(tag):
    """Remove empty ``( )`` and ``[ ]`` groups split across sibling nodes.

    For every element found below ``tag`` the direct children are scanned
    for a text node that is just an opening bracket and is followed, possibly
    after empty filler, by a text node that is just the matching closing
    bracket. The whole group is extracted from the tree.

    * ``( )``: the opener is followed directly by the closer, or by exactly
      one whitespace-only text node and then the closer.
    * ``[ ]``: any number of whitespace-only text nodes and empty ``<span>``
      elements may sit between opener and closer.

    Args:
        tag: Root node whose descendants are cleaned in place. It must
            provide ``find_all``; presumably a BeautifulSoup ``Tag``.
    """

    def is_empty_text(n):
        return isinstance(n, str) and not n.strip()

    def is_char(n, char):
        return isinstance(n, str) and n.strip() == char

    def is_empty_span(n):
        return (
            getattr(n, "name", None) == "span"
            and not n.get_text(strip=True)
        )

    # Snapshot the element list: the tree is mutated while we walk it.
    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]

            if is_char(curr, "("):
                nxt = children[i + 1]
                if is_char(nxt, ")"):
                    group = [curr, nxt]
                elif (
                    is_empty_text(nxt)
                    and i + 2 < len(children)
                    and is_char(children[i + 2], ")")
                ):
                    # The closer belongs to the group as well; removing only
                    # the opener and the blank would leave a dangling ")".
                    group = [curr, nxt, children[i + 2]]
                else:
                    group = []

                if group:
                    for member in group:
                        member.extract()
                    children = list(el.children)
                    # Stay on index i: a new node has moved into this slot.
                    continue

            elif is_char(curr, "["):
                # Skip over empty filler between the opener and its closer.
                j = i + 1
                while j < len(children) and (
                    is_empty_text(children[j]) or is_empty_span(children[j])
                ):
                    j += 1

                if j < len(children) and is_char(children[j], "]"):
                    for member in children[i:j + 1]:
                        member.extract()
                    children = list(el.children)
                    continue

            i += 1
 def remove_inline_empty_brackets(tag):
    """Delete empty ``()`` and ``[]`` pairs contained in a single text node.

    Every string node below ``tag`` is searched for a bracket pair that
    encloses nothing but whitespace (spaces or line breaks). Removal is
    repeated until the text stops changing, so nested empties such as
    ``([ ])`` or ``(( ))`` disappear completely instead of leaving an outer
    ``()`` behind.

    Args:
        tag: Root node whose string descendants are rewritten in place. It
            must provide ``find_all``; presumably a BeautifulSoup ``Tag``.
    """
    empty_pair = re.compile(r"\(\s*\)|\[\s*\]")

    # Snapshot the string list: nodes are replaced while we iterate.
    for node in list(tag.find_all(string=True)):
        text = str(node)

        new_text = text
        while True:
            reduced = empty_pair.sub("", new_text)
            if reduced == new_text:
                break
            new_text = reduced

        if new_text != text:
            # Rebuild with the node's own class so that a special string
            # node (e.g. a comment) is not turned into plain visible text.
            node.replace_with(type(node)(new_text))
 def remove_split_empty_parentheses(tag):
    """Remove empty parentheses whose two halves live in different nodes.

    Handles sibling sequences of the form::

        "some text ("  [blank text / empty <span> ...]  ") more text"

    The trailing ``(`` is cut from the first text node, the empty filler in
    between is extracted, and the leading ``)`` is cut from the closing text
    node. Text before the ``(`` and after the ``)`` is kept.

    Args:
        tag: Root node whose descendants are cleaned in place. It must
            provide ``find_all``; presumably a BeautifulSoup ``Tag``.
    """

    def is_filler(n):
        # Whitespace-only text or a <span> without any text content.
        if isinstance(n, str):
            return not n.strip()
        return (
            getattr(n, "name", None) == "span"
            and not n.get_text(strip=True)
        )

    for el in tag.find_all(True):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]

            # Candidate opener: a text node whose last visible char is "(".
            if isinstance(node, str) and node.rstrip().endswith("("):
                idx = node.rfind("(")

                j = i + 1
                middle = []
                while j < len(children) and is_filler(children[j]):
                    middle.append(children[j])
                    j += 1

                end = children[j] if j < len(children) else None
                if isinstance(end, str) and end.lstrip().startswith(")"):
                    head = node[:idx]
                    tail = end.lstrip()[1:]

                    # Keep the node class when rewriting so special string
                    # nodes are not converted into plain text.
                    if head:
                        node.replace_with(type(node)(head))
                    else:
                        node.extract()

                    for filler in middle:
                        filler.extract()

                    # Text following the ")" is real content and must stay;
                    # a closer with nothing but whitespace is dropped whole.
                    if tail.strip():
                        end.replace_with(type(end)(tail))
                    else:
                        end.extract()

                    children = list(el.children)
                    # Re-examine index i: the rewritten text may itself end
                    # with another "(" (nested empty parentheses).
                    continue

            i += 1
 # ======================
 # CORE FUNCTIONS
 # ======================
@ -165,12 +263,6 @@ def clean_html_file(input_path: Path, output_path: Path):
                continue
        a.decompose()
    for el in content.find_all(string=True):
        t = el.strip()
        if t in {"(", ")"}:
            if len(el.parent.get_text(strip=True)) <= 2:
                el.extract()
    # Remove MediaWiki show/hide links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
@ -182,8 +274,6 @@ def clean_html_file(input_path: Path, output_path: Path):
            "show/hide",
            "click expand",
            "expand to read",
            "( )",
            "[ ]"
        ]):
            el.extract()
@ -206,6 +296,10 @@ def clean_html_file(input_path: Path, output_path: Path):
        src = src.replace("../", "")
        img["src"] = src
    remove_empty_bracket_groups(content)
    remove_split_empty_parentheses(content)
    remove_inline_empty_brackets(content)
    # Output cleaned HTML
    cleaned_html = content.prettify()
    output_path.write_text(cleaned_html, encoding="utf-8")