Clean up parentheses & brackets

This commit is contained in:
Maxime Réaux 2026-04-15 08:08:00 +02:00
parent c9fb3513ee
commit 6eeabd7c9d

View file

@ -1,4 +1,5 @@
import os import os
import re
from pathlib import Path from pathlib import Path
from bs4 import BeautifulSoup, Comment from bs4 import BeautifulSoup, Comment
@ -87,6 +88,103 @@ def is_in_rules_clarifications(el):
title = prev.get_text(strip=True).lower() title = prev.get_text(strip=True).lower()
return "rules clarification" in title return "rules clarification" in title
def remove_empty_bracket_groups(tag):
    """Remove empty ``( )`` and ``[ ]`` groups made of separate child nodes.

    For ``tag`` and every element below it, the direct children are scanned
    for an opener text node (``(`` or ``[`` once stripped) that is followed
    by its closer with nothing meaningful in between:

    * ``(`` ... ``)``: at most one whitespace-only text node in between.
    * ``[`` ... ``]``: any number of whitespace-only text nodes or ``span``
      elements without text in between.

    The opener, everything in between and the closer are all extracted from
    the tree. ``tag`` is modified in place; nothing is returned.
    """

    def is_empty_text(node):
        return isinstance(node, str) and not node.strip()

    def is_char(node, char):
        return isinstance(node, str) and node.strip() == char

    def is_empty_span(node):
        return (
            getattr(node, "name", None) == "span"
            and not node.get_text(strip=True)
        )

    def find_group_end(children, start):
        """Return the closer index of an empty group opening at *start*, else None."""
        opener = children[start]
        if is_char(opener, "("):
            idx = start + 1
            # Tolerate a single whitespace-only text node inside the parentheses.
            if idx < len(children) and is_empty_text(children[idx]):
                idx += 1
            if idx < len(children) and is_char(children[idx], ")"):
                return idx
            return None
        if is_char(opener, "["):
            for idx in range(start + 1, len(children)):
                node = children[idx]
                if is_char(node, "]"):
                    return idx
                if not (is_empty_text(node) or is_empty_span(node)):
                    return None
        return None

    # find_all(True) only yields descendants, so add the root explicitly;
    # otherwise groups sitting directly under ``tag`` would never be scanned.
    for el in [tag, *tag.find_all(True)]:
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            end = find_group_end(children, i)
            if end is None:
                i += 1
                continue
            # Drop the whole group, closer included, so no stray ")" or "]"
            # is left behind.
            for node in children[i:end + 1]:
                node.extract()
            # The snapshot is stale after extraction; re-read it and look at
            # the same index again, which now holds the following node.
            children = list(el.children)
def remove_inline_empty_brackets(tag):
    """Strip empty ``()`` / ``[]`` pairs that sit inside a single text node.

    Every string node under ``tag`` is searched for parentheses or square
    brackets that enclose nothing but whitespace (spaces, newlines, ...).
    The substitution is repeated until the text stops changing, so nested
    leftovers such as ``([ ])`` disappear completely.

    A changed node is replaced by a new node of the *same class*: passing a
    plain ``str`` to ``replace_with`` would turn a comment (or any other
    special string node) into ordinary visible text. A node whose text
    becomes empty is extracted instead of being left behind as an empty
    string. ``tag`` is modified in place; nothing is returned.
    """
    # "( )" and "[ ]" with any amount of whitespace (including line breaks).
    patterns = (r"\(\s*\)", r"\[\s*\]")

    for node in list(tag.find_all(string=True)):
        original = str(node)
        cleaned = original
        while True:
            before = cleaned
            for pattern in patterns:
                cleaned = re.sub(pattern, "", cleaned)
            if cleaned == before:
                break

        if cleaned == original:
            continue
        if cleaned:
            # type(node) keeps Comment/CData/... nodes what they were.
            node.replace_with(type(node)(cleaned))
        else:
            node.extract()
def remove_split_empty_parentheses(tag):
    """Remove empty parentheses whose ``(`` and ``)`` live in different nodes.

    For ``tag`` and every element below it, the direct children are scanned
    for a text node that ends with ``(`` (ignoring trailing whitespace). If
    the following siblings are only whitespace-only text nodes or ``span``
    elements without text, and the next sibling after them is a text node
    that starts with ``)`` (ignoring leading whitespace), the pair is
    removed:

    * the opener keeps the text before its last ``(``;
    * the filler nodes in between are extracted;
    * the closer keeps the text after its first ``)``.

    Text nodes are rebuilt with their own class so special string nodes are
    not converted to plain text, and nodes left with no text (or, for the
    closer, only whitespace) are extracted rather than kept as empty
    strings. ``tag`` is modified in place; nothing is returned.
    """

    def is_filler(node):
        if isinstance(node, str):
            return not node.strip()
        return (
            getattr(node, "name", None) == "span"
            and not node.get_text(strip=True)
        )

    # find_all(True) only yields descendants, so add the root explicitly.
    for el in [tag, *tag.find_all(True)]:
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            if not isinstance(node, str):
                i += 1
                continue

            text = str(node)
            open_at = text.rfind("(")
            # The last "(" must be the final non-blank character of the node.
            if open_at == -1 or text[open_at:].strip() != "(":
                i += 1
                continue

            j = i + 1
            while j < len(children) and is_filler(children[j]):
                j += 1
            closer = children[j] if j < len(children) else None
            if not isinstance(closer, str) or not closer.lstrip().startswith(")"):
                i += 1
                continue

            head = text[:open_at]
            closer_text = str(closer)
            tail = closer_text[closer_text.find(")") + 1:]

            if head:
                node.replace_with(type(node)(head))
            else:
                node.extract()
            for filler in children[i + 1:j]:
                filler.extract()
            if tail.strip():
                closer.replace_with(type(closer)(tail))
            else:
                closer.extract()

            # Re-read the children and re-check the same index: the shortened
            # opener may end with another "(" (nested empty parentheses).
            children = list(el.children)
# ====================== # ======================
# CORE FUNCTIONS # CORE FUNCTIONS
# ====================== # ======================
@ -165,12 +263,6 @@ def clean_html_file(input_path: Path, output_path: Path):
continue continue
a.decompose() a.decompose()
for el in content.find_all(string=True):
t = el.strip()
if t in {"(", ")"}:
if len(el.parent.get_text(strip=True)) <= 2:
el.extract()
# Remove MediaWiki show/hide links # Remove MediaWiki show/hide links
for el in content.select( for el in content.select(
".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder" ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
@ -182,8 +274,6 @@ def clean_html_file(input_path: Path, output_path: Path):
"show/hide", "show/hide",
"click expand", "click expand",
"expand to read", "expand to read",
"( )",
"[ ]"
]): ]):
el.extract() el.extract()
@ -206,6 +296,10 @@ def clean_html_file(input_path: Path, output_path: Path):
src = src.replace("../", "") src = src.replace("../", "")
img["src"] = src img["src"] = src
remove_empty_bracket_groups(content)
remove_split_empty_parentheses(content)
remove_inline_empty_brackets(content)
# Output cleaned HTML # Output cleaned HTML
cleaned_html = content.prettify() cleaned_html = content.prettify()
output_path.write_text(cleaned_html, encoding="utf-8") output_path.write_text(cleaned_html, encoding="utf-8")