Clean up parentheses & brackets
This commit is contained in:
parent
c9fb3513ee
commit
6eeabd7c9d
1 changed files with 102 additions and 8 deletions
|
|
@ -1,4 +1,5 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from bs4 import BeautifulSoup, Comment
|
from bs4 import BeautifulSoup, Comment
|
||||||
|
|
||||||
|
|
@ -87,6 +88,103 @@ def is_in_rules_clarifications(el):
|
||||||
title = prev.get_text(strip=True).lower()
|
title = prev.get_text(strip=True).lower()
|
||||||
return "rules clarification" in title
|
return "rules clarification" in title
|
||||||
|
|
||||||
|
def remove_empty_bracket_groups(tag):
    """Remove empty "( )" and "[ ]" groups whose brackets are separate sibling nodes.

    For every element below ``tag``, the direct children are scanned for a
    text node that is exactly an opening bracket (ignoring surrounding
    whitespace), followed by any run of whitespace-only text nodes and/or
    ``<span>`` elements without visible text, followed by a text node that
    is exactly the matching closing bracket.  The opener, the filler nodes
    and the closer are all extracted from the tree.

    Args:
        tag: Root of the tree to clean; must provide ``find_all`` and its
            descendants ``children`` / ``extract`` (a BeautifulSoup tag in
            this file).  The tree is modified in place.

    Returns:
        None.
    """
    closers = {"(": ")", "[": "]"}

    def is_blank(node):
        # Whitespace-only text node, or a <span> that renders no text.
        if isinstance(node, str):
            return not node.strip()
        return getattr(node, "name", None) == "span" and not node.get_text(strip=True)

    # Snapshot the element list: nodes are extracted while we walk it.
    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]
            # Only a text node consisting solely of "(" or "[" opens a group.
            closer = closers.get(curr.strip()) if isinstance(curr, str) else None
            if closer is None:
                i += 1
                continue

            # Skip over everything that renders as nothing.
            j = i + 1
            while j < len(children) and is_blank(children[j]):
                j += 1

            closes_here = (
                j < len(children)
                and isinstance(children[j], str)
                and children[j].strip() == closer
            )
            if closes_here:
                # Remove opener, filler AND closer; leaving the closer behind
                # would produce a dangling ")" / "]" in the output.
                for node in children[i:j + 1]:
                    node.extract()
                # The sibling list changed; re-read it and re-examine index i,
                # which now holds whatever followed the removed group.
                children = list(el.children)
                continue

            i += 1
|
||||||
|
|
||||||
|
|
||||||
|
def remove_inline_empty_brackets(tag):
    """Strip empty "()" and "[]" pairs that sit inside a single text node.

    A pair counts as empty when only whitespace (spaces, tabs, line breaks)
    separates the opening and closing bracket.  The substitutions are
    repeated until the text stops changing, so nested empties such as
    "( [ ] )" disappear completely instead of leaving "(  )" behind.

    Comment nodes and the contents of ``<script>`` / ``<style>`` are left
    alone: ``find_all(string=True)`` returns them too, and replacing a
    comment with a plain string would turn it into visible page text, while
    stripping "()" from script text would corrupt it.

    Args:
        tag: Root of the tree to clean; must provide ``find_all`` (a
            BeautifulSoup tag in this file).  Text nodes are replaced in
            place.

    Returns:
        None.
    """
    # "( )" with spaces or line breaks, then "[ ]" with spaces (or spans
    # that an earlier pass already removed).
    empty_pairs = (re.compile(r"\(\s*\)"), re.compile(r"\[\s*\]"))

    # Snapshot the node list: nodes are replaced while we walk it.
    for node in list(tag.find_all(string=True)):
        if isinstance(node, Comment):
            continue
        if getattr(node.parent, "name", None) in {"script", "style"}:
            continue

        text = str(node)
        new_text = text
        while True:
            reduced = new_text
            for pattern in empty_pairs:
                reduced = pattern.sub("", reduced)
            if reduced == new_text:
                break
            new_text = reduced

        if new_text != text:
            node.replace_with(new_text)
|
||||||
|
|
||||||
|
def remove_split_empty_parentheses(tag):
    """Remove empty "( )" groups that are split across sibling nodes with text around them.

    For every element below ``tag``, the direct children are scanned for a
    text node whose last non-whitespace character is "(".  If it is followed
    only by whitespace-only text nodes and/or ``<span>`` elements without
    visible text, and then by a text node that begins with ")" (ignoring
    leading whitespace), the group is empty and is removed:

    * the opening text node is cut just before its last "(";
    * the filler nodes in between are extracted;
    * the closing node is extracted when it holds nothing but ")", otherwise
      it is replaced by the text that follows the ")".

    Args:
        tag: Root of the tree to clean; must provide ``find_all`` and its
            descendants ``children`` / ``extract`` / ``replace_with`` (a
            BeautifulSoup tag in this file).  The tree is modified in place.

    Returns:
        None.
    """

    def is_blank(node):
        # Whitespace-only text node, or a <span> that renders no text.
        if isinstance(node, str):
            return not node.strip()
        return getattr(node, "name", None) == "span" and not node.get_text(strip=True)

    # Snapshot the element list: nodes are extracted while we walk it.
    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            # The group can only open in a text node that ends with "(".
            if not (isinstance(node, str) and node.rstrip().endswith("(")):
                i += 1
                continue

            # Collect the invisible filler between the two brackets.
            j = i + 1
            while j < len(children) and is_blank(children[j]):
                j += 1

            end = children[j] if j < len(children) else None
            if not (isinstance(end, str) and end.lstrip().startswith(")")):
                i += 1
                continue

            # Drop the trailing "(" (and the whitespace after it).
            node.replace_with(node[:node.rfind("(")])
            for filler in children[i + 1:j]:
                filler.extract()

            # Drop the ")"; keep any real text that follows it.
            remainder = end.lstrip()[1:]
            if remainder.strip():
                end.replace_with(remainder)
            else:
                end.extract()

            # The sibling list changed; re-read it and re-examine index i so
            # that nested groups like "((" ... "))" are unwound as well.
            children = list(el.children)
|
||||||
|
|
||||||
# ======================
|
# ======================
|
||||||
# CORE FUNCTIONS
|
# CORE FUNCTIONS
|
||||||
# ======================
|
# ======================
|
||||||
|
|
@ -165,12 +263,6 @@ def clean_html_file(input_path: Path, output_path: Path):
|
||||||
continue
|
continue
|
||||||
a.decompose()
|
a.decompose()
|
||||||
|
|
||||||
for el in content.find_all(string=True):
|
|
||||||
t = el.strip()
|
|
||||||
if t in {"(", ")"}:
|
|
||||||
if len(el.parent.get_text(strip=True)) <= 2:
|
|
||||||
el.extract()
|
|
||||||
|
|
||||||
# Remove MediaWiki show/hide links
|
# Remove MediaWiki show/hide links
|
||||||
for el in content.select(
|
for el in content.select(
|
||||||
".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
|
".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
|
||||||
|
|
@ -182,8 +274,6 @@ def clean_html_file(input_path: Path, output_path: Path):
|
||||||
"show/hide",
|
"show/hide",
|
||||||
"click expand",
|
"click expand",
|
||||||
"expand to read",
|
"expand to read",
|
||||||
"( )",
|
|
||||||
"[ ]"
|
|
||||||
]):
|
]):
|
||||||
el.extract()
|
el.extract()
|
||||||
|
|
||||||
|
|
@ -206,6 +296,10 @@ def clean_html_file(input_path: Path, output_path: Path):
|
||||||
src = src.replace("../", "")
|
src = src.replace("../", "")
|
||||||
img["src"] = src
|
img["src"] = src
|
||||||
|
|
||||||
|
remove_empty_bracket_groups(content)
|
||||||
|
remove_split_empty_parentheses(content)
|
||||||
|
remove_inline_empty_brackets(content)
|
||||||
|
|
||||||
# Output cleaned HTML
|
# Output cleaned HTML
|
||||||
cleaned_html = content.prettify()
|
cleaned_html = content.prettify()
|
||||||
output_path.write_text(cleaned_html, encoding="utf-8")
|
output_path.write_text(cleaned_html, encoding="utf-8")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue