Clean up parentheses & brackets
This commit is contained in:
parent
c9fb3513ee
commit
6eeabd7c9d
1 changed files with 102 additions and 8 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
|
|
@ -87,6 +88,103 @@ def is_in_rules_clarifications(el):
|
|||
title = prev.get_text(strip=True).lower()
|
||||
return "rules clarification" in title
|
||||
|
||||
def remove_empty_bracket_groups(tag):
    """Remove empty ``( )`` / ``[ ]`` groups whose brackets are separate nodes.

    A group is removed when an element has a text child that is exactly an
    opening bracket (ignoring surrounding whitespace), followed only by
    whitespace-only text nodes and/or empty ``<span>`` elements, followed by
    a text child that is exactly the matching closing bracket. The opening
    bracket, the filler nodes and the closing bracket are all extracted.

    The tree is modified in place; nothing is returned.

    Args:
        tag: Root element to clean. Its own direct children are scanned as
            well as the children of every descendant element.
    """
    closers = {"(": ")", "[": "]"}

    def bracket_char(n):
        # Stripped text of a string node, or None for elements.
        return n.strip() if isinstance(n, str) else None

    def is_blank_text(n):
        return isinstance(n, str) and not n.strip()

    def is_empty_span(n):
        return (
            getattr(n, "name", None) == "span"
            and not n.get_text(strip=True)
        )

    # find_all(True) yields descendants only, so the root is added explicitly;
    # otherwise bracket nodes sitting directly under `tag` would be skipped.
    for el in [tag, *tag.find_all(True)]:
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            closer = closers.get(bracket_char(children[i]))
            if closer is None:
                i += 1
                continue

            # Skip everything that renders as nothing between the brackets.
            j = i + 1
            while j < len(children) and (
                is_blank_text(children[j]) or is_empty_span(children[j])
            ):
                j += 1

            if j < len(children) and bracket_char(children[j]) == closer:
                # Remove the whole group, closing bracket included, so no
                # dangling ")" or "]" is left behind.
                for node in children[i:j + 1]:
                    node.extract()
                # The snapshot is stale after extraction; re-read it and
                # re-examine the node that slid into position i.
                children = list(el.children)
                continue

            i += 1
|
||||
|
||||
|
||||
def remove_inline_empty_brackets(tag):
    """Strip empty ``( )`` and ``[ ]`` pairs that sit inside one text node.

    Every string node under ``tag`` is rewritten with bracket pairs that
    contain nothing but whitespace (spaces, tabs, line breaks) removed.
    Parentheses are removed first, then square brackets, so ``"[()]"``
    collapses completely.

    The tree is modified in place; nothing is returned.

    Args:
        tag: Root element whose descendant string nodes are cleaned.
    """
    # Snapshot the nodes first: replace_with() mutates the tree being walked.
    for node in list(tag.find_all(string=True)):
        text = str(node)
        # "( )" containing only spaces or line breaks
        new_text = re.sub(r"\(\s*\)", "", text)
        # "[ ]" containing only whitespace (e.g. after empty spans were removed)
        new_text = re.sub(r"\[\s*\]", "", new_text)
        if new_text != text:
            # Rebuild the node with its own class. Passing a plain str makes
            # BeautifulSoup wrap it in a NavigableString, which would turn a
            # Comment/CData node into visible page text.
            node.replace_with(type(node)(new_text))
|
||||
|
||||
def remove_split_empty_parentheses(tag):
    """Remove empty parentheses whose ``(`` and ``)`` live in different nodes.

    Handles the pattern ``"text ("`` + filler + ``") more text"``, where the
    filler is any run of whitespace-only text nodes and empty ``<span>``
    elements. The trailing ``(`` is cut from the first text node, the filler
    nodes are extracted, and the leading ``)`` is cut from the closing text
    node. A text node that ends up empty (or, for the closing node,
    whitespace-only) is removed instead of being left in the tree.

    The tree is modified in place; nothing is returned.

    Args:
        tag: Root element to clean. Its own direct children are scanned as
            well as the children of every descendant element.
    """

    def is_filler(n):
        # Whitespace-only text, or a <span> with no visible text.
        if isinstance(n, str):
            return not n.strip()
        return getattr(n, "name", None) == "span" and not n.get_text(strip=True)

    # find_all(True) yields descendants only, so the root is added explicitly;
    # otherwise parentheses sitting directly under `tag` would be skipped.
    for el in [tag, *tag.find_all(True)]:
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            opener_text = node.rstrip() if isinstance(node, str) else ""
            if not opener_text.endswith("("):
                i += 1
                continue

            # Collect everything that renders as nothing after the "(".
            j = i + 1
            while j < len(children) and is_filler(children[j]):
                j += 1

            closer = children[j] if j < len(children) else None
            closer_text = closer.lstrip() if isinstance(closer, str) else ""
            if not closer_text.startswith(")"):
                i += 1
                continue

            head = opener_text[:-1]   # text before the "("
            tail = closer_text[1:]    # text after the ")"

            # Rebuild with the node's own class: a plain str would be wrapped
            # in a NavigableString and change e.g. a Comment into page text.
            if head:
                node.replace_with(type(node)(head))
            else:
                node.extract()
            for filler in children[i + 1:j]:
                filler.extract()
            if tail.strip():
                closer.replace_with(type(closer)(tail))
            else:
                closer.extract()

            # The snapshot is stale; re-read it and re-examine position i
            # (the remaining head may itself end with another "(").
            children = list(el.children)
|
||||
|
||||
# ======================
|
||||
# CORE FUNCTIONS
|
||||
# ======================
|
||||
|
|
@ -165,12 +263,6 @@ def clean_html_file(input_path: Path, output_path: Path):
|
|||
continue
|
||||
a.decompose()
|
||||
|
||||
for el in content.find_all(string=True):
|
||||
t = el.strip()
|
||||
if t in {"(", ")"}:
|
||||
if len(el.parent.get_text(strip=True)) <= 2:
|
||||
el.extract()
|
||||
|
||||
# Remove MediaWiki show/hide links
|
||||
for el in content.select(
|
||||
".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
|
||||
|
|
@ -182,8 +274,6 @@ def clean_html_file(input_path: Path, output_path: Path):
|
|||
"show/hide",
|
||||
"click expand",
|
||||
"expand to read",
|
||||
"( )",
|
||||
"[ ]"
|
||||
]):
|
||||
el.extract()
|
||||
|
||||
|
|
@ -206,6 +296,10 @@ def clean_html_file(input_path: Path, output_path: Path):
|
|||
src = src.replace("../", "")
|
||||
img["src"] = src
|
||||
|
||||
remove_empty_bracket_groups(content)
|
||||
remove_split_empty_parentheses(content)
|
||||
remove_inline_empty_brackets(content)
|
||||
|
||||
# Output cleaned HTML
|
||||
cleaned_html = content.prettify()
|
||||
output_path.write_text(cleaned_html, encoding="utf-8")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue