# Helpers that strip empty "( )" / "[ ]" groups left behind in cleaned HTML
# (for example after the links they used to wrap have been removed).
#
# Intended call order at the end of clean_html_file(), just before
# content.prettify():
#     remove_empty_bracket_groups(content)
#     remove_split_empty_parentheses(content)
#     remove_inline_empty_brackets(content)

import re

# "()" / "[]" containing nothing but whitespace (spaces, tabs, newlines).
_EMPTY_PARENS_RE = re.compile(r"\(\s*\)")
_EMPTY_BRACKETS_RE = re.compile(r"\[\s*\]")


def _is_blank_text(node):
    """Return True for a text node that is empty or whitespace-only."""
    return isinstance(node, str) and not node.strip()


def _is_token(node, token):
    """Return True for a text node equal to *token* once stripped."""
    return isinstance(node, str) and node.strip() == token


def _is_empty_span(node):
    """Return True for a <span> element that renders no text."""
    return (
        getattr(node, "name", None) == "span"
        and not node.get_text(strip=True)
    )


def _is_filler(node):
    """Return True for nodes that may sit inside an "empty" bracket pair."""
    return _is_blank_text(node) or _is_empty_span(node)


def _with_descendants(tag):
    """Return *tag* followed by every element nested inside it.

    ``find_all(True)`` alone skips *tag* itself, which would leave text that
    is a direct child of *tag* unexamined.
    """
    return [tag, *tag.find_all(True)]


def remove_empty_bracket_groups(tag):
    """Remove sibling runs that form an empty ``( )`` or ``[ ]`` group.

    A group is an opening text node that is exactly ``(`` or ``[`` (ignoring
    surrounding whitespace), optional filler nodes, and a text node that is
    exactly the matching closer.  Between parentheses only whitespace-only
    text counts as filler; between square brackets empty ``<span>`` elements
    count as well.  The opener, the filler and the closer are all extracted.

    Args:
        tag: BeautifulSoup element to clean in place (its own children and
            those of every descendant element are examined).
    """
    for el in _with_descendants(tag):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            node = children[i]
            if _is_token(node, "("):
                closer, is_fill = ")", _is_blank_text
            elif _is_token(node, "["):
                closer, is_fill = "]", _is_filler
            else:
                i += 1
                continue

            # Skip over filler to find the candidate closing node.
            j = i + 1
            while j < len(children) and is_fill(children[j]):
                j += 1

            if j < len(children) and _is_token(children[j], closer):
                # Drop opener, filler and closer together so that no
                # dangling bracket is left behind.
                for doomed in children[i:j + 1]:
                    doomed.extract()
                children = list(el.children)
                continue  # re-examine whatever now sits at position i
            i += 1


def remove_inline_empty_brackets(tag):
    """Delete ``()`` and ``[]`` pairs that enclose only whitespace in text.

    Each text node under *tag* is rewritten independently: empty parentheses
    are removed first, then empty square brackets.

    Args:
        tag: BeautifulSoup element whose text nodes are rewritten in place.
    """
    # NOTE(review): find_all(string=True) also yields comment/script/style
    # strings if any are still present in the tree -- confirm they have been
    # stripped before this pass runs.
    for node in list(tag.find_all(string=True)):
        text = str(node)
        # "( )" with spaces or line breaks in between.
        cleaned = _EMPTY_PARENS_RE.sub("", text)
        # "[ ]" with spaces (inner spans were already removed earlier).
        cleaned = _EMPTY_BRACKETS_RE.sub("", cleaned)
        if cleaned != text:
            node.replace_with(cleaned)


def remove_split_empty_parentheses(tag):
    """Remove an empty ``( )`` pair that is split across sibling nodes.

    Handles the pattern ``"text ("``, optional filler (whitespace-only text
    or empty ``<span>`` elements), ``") more text"``.  The text before the
    ``(`` and after the ``)`` is kept; the parentheses and the filler are
    removed.

    Args:
        tag: BeautifulSoup element to clean in place (its own children and
            those of every descendant element are examined).
    """
    for el in _with_descendants(tag):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            open_at = node.rfind("(") if isinstance(node, str) else -1
            # Only a "(" followed by nothing but whitespace can open a
            # split pair.
            if open_at == -1 or node[open_at + 1:].strip():
                i += 1
                continue

            j = i + 1
            while j < len(children) and _is_filler(children[j]):
                j += 1
            closing = children[j] if j < len(children) else None
            if not (
                isinstance(closing, str)
                and closing.lstrip().startswith(")")
            ):
                i += 1
                continue

            prefix = node[:open_at]
            if prefix:
                node.replace_with(prefix)
            else:
                node.extract()
            for filler in children[i + 1:j]:
                filler.extract()
            # Keep whatever text follows the ")" in the closing node.
            remainder = closing.lstrip()[1:]
            if remainder.strip():
                closing.replace_with(remainder)
            else:
                closing.extract()
            # Stay at position i: the shortened text may expose another pair.
            children = list(el.children)