374 lines
No EOL
11 KiB
Python
374 lines
No EOL
11 KiB
Python
import os
|
|
import re
|
|
from pathlib import Path
|
|
from bs4 import BeautifulSoup, Comment
|
|
|
|
# ======================
|
|
# CONFIG
|
|
# ======================
|
|
|
|
# Directory holding the raw scraped pages (relative to this script's CWD).
SOURCE_DIR = Path("../output/pages")

# Directory where the cleaned pages are written; created on demand by process_all().
OUTPUT_DIR = Path("../output/cleaned_pages")

# CSS selectors for MediaWiki chrome that is always stripped from the content:
# table of contents, edit links, footnote markers, print/navigation helpers,
# and category-listing containers.
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages"
]

# Tags whose entire subtree is dropped before any other processing.
REMOVE_TAGS = [
    "script",
    "style",
]

# Section headings (matched case-insensitively, by substring) whose whole
# section — heading plus following siblings up to the next h1/h2/h3 — is removed.
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

# Elements whose class attribute contains one of these substrings are removed.
# NOTE: "mw-collapsible" is special-cased in clean_html_file — collapsibles are
# kept when they sit under a "Rules Clarification" heading.
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

# Presentation-only attributes stripped from every remaining element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]
|
|
|
|
# ======================
|
|
# HELPERS
|
|
# ======================
|
|
|
|
def safe_elements(elements):
    """Yield only the items of *elements* that carry a non-empty ``attrs`` mapping.

    Iterates over a snapshot of *elements* so the underlying collection may be
    mutated (e.g. by decomposing tags) while the generator is being consumed.
    """
    snapshot = list(elements)
    for candidate in snapshot:
        attrs = getattr(candidate, "attrs", None)
        if not attrs:
            continue
        yield candidate
|
|
|
|
def is_unwanted_section(title: str) -> bool:
    """Return True when *title* contains any entry of SECTION_TITLES_TO_REMOVE.

    Matching is case-insensitive and by substring, so e.g. "See also (rules)"
    matches "see also".
    """
    lowered = title.lower()
    for unwanted in SECTION_TITLES_TO_REMOVE:
        if unwanted in lowered:
            return True
    return False
|
|
|
|
def remove_section(header):
    """Decompose *header* and every following sibling up to — but not
    including — the next h1/h2/h3 heading (or the end of the parent)."""
    stop_names = ("h1", "h2", "h3")
    current = header
    while True:
        # Grab the successor before decomposing, since decompose()
        # detaches the node from the tree.
        upcoming = current.find_next_sibling()
        current.decompose()
        if not upcoming or upcoming.name in stop_names:
            return
        current = upcoming
|
|
|
|
def is_bad_collapsible(el):
    """Return True for collapsible elements that are also navboxes.

    (The navbox test is one possible criterion; others could be added.)
    """
    classes = el.get("class", [])
    required_markers = ("mw-collapsible", "navbox")
    return all(marker in classes for marker in required_markers)
|
|
|
|
|
|
def is_in_rules_clarifications(el):
    """Return True when the nearest preceding h1/h2/h3 heading reads like a
    "Rules Clarification" section (substring match, case-insensitive)."""
    heading = el.find_previous(["h1", "h2", "h3"])
    if not heading:
        return False
    heading_text = heading.get_text(strip=True).lower()
    return "rules clarification" in heading_text
|
|
|
|
def remove_empty_bracket_groups(tag):
    """Remove bracket pairs that are empty at the DOM level.

    Scans each element's direct children for the patterns
    ``"(" [whitespace] ")"`` and ``"[" {whitespace | empty <span>}* "]"``
    split across separate child nodes, and extracts the nodes that make up
    the empty group. Purely textual cases like "( )" inside a single text
    node are handled later by remove_inline_empty_brackets().
    """
    for el in list(tag.find_all(True)):
        # Work on a snapshot of the child list; it is refreshed after every
        # extraction so the indices stay consistent with the live tree.
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]
            nxt = children[i + 1]

            # True for a text node that is empty or whitespace-only.
            def is_empty_text(n):
                return isinstance(n, str) and not n.strip()

            # True for a text node whose stripped content is one of *chars*.
            def is_char(n, chars):
                return isinstance(n, str) and n.strip() in chars

            # True for a <span> element with no visible text content.
            def is_empty_span(n):
                return (
                    getattr(n, "name", None) == "span"
                    and not n.get_text(strip=True)
                )
            # Case 1: "(" directly followed by ")" — or separated by one
            # whitespace-only text node.
            if (
                is_char(curr, {"("})
                and (
                    is_char(nxt, {")"})
                    or (is_empty_text(nxt) and i + 2 < len(children) and is_char(children[i + 2], {")"}))
                )
            ):
                # NOTE(review): in the three-node form "(", ws, ")" only the
                # first two nodes are extracted here, leaving a stray ")"
                # text node behind — confirm whether that is intended.
                curr.extract()
                nxt.extract()
                # `len(children)` is still the pre-extraction length here, so
                # this condition always holds; refresh the snapshot and
                # re-examine the same index without advancing.
                if i < len(children):
                    children = list(el.children)
                continue
            # Case 2: "[" followed by any run of whitespace/empty spans and a
            # closing "]".
            if is_char(curr, {"["}):
                j = i + 1
                found_close = False
                while j < len(children):
                    node = children[j]
                    if is_char(node, {"]"}):
                        found_close = True
                        break
                    # Anything with visible content means the brackets are
                    # not empty — abandon this candidate.
                    if not (is_empty_text(node) or is_empty_span(node)):
                        break
                    j += 1
                if found_close:
                    # remove [ ... ] — the opener, all fillers, and the closer.
                    curr.extract()
                    for k in range(i + 1, j + 1):
                        children[k].extract()
                    children = list(el.children)
                    continue
            i += 1
|
|
|
|
|
|
def remove_inline_empty_brackets(tag):
    """Collapse empty "( )" and "[ ]" pairs that live inside a single text node.

    Whitespace (including newlines) between the brackets is allowed; the
    text node is replaced only when something actually changed.
    """
    paren_pattern = re.compile(r"\(\s*\)")
    bracket_pattern = re.compile(r"\[\s*\]")
    for text_node in list(tag.find_all(string=True)):
        original = str(text_node)
        # Parentheses first, then square brackets (spans inside them were
        # already cleaned out by the DOM-level pass).
        cleaned = bracket_pattern.sub("", paren_pattern.sub("", original))
        if cleaned != original:
            text_node.replace_with(cleaned)
|
|
|
|
def remove_split_empty_parentheses(tag):
    """Remove "(" ... ")" groups whose opener ends one text node, whose middle
    is only whitespace text and empty <span>s, and whose closer is a text node
    that is exactly ")" once stripped.

    Complements remove_empty_bracket_groups(): here the "(" is the *tail* of
    a longer text node (e.g. "Cost (") rather than a standalone node.
    """
    for el in tag.find_all(True):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            if isinstance(node, str) and "(" in node:
                # Only act when the text node *ends* with the opener
                # (ignoring trailing whitespace).
                idx = node.rfind("(")
                if node[idx:].strip() == "(":
                    # Collect the run of ignorable filler nodes after it.
                    j = i + 1
                    middle = []
                    while j < len(children):
                        nxt = children[j]
                        if isinstance(nxt, str) and not nxt.strip():
                            middle.append(nxt)
                            j += 1
                            continue
                        if getattr(nxt, "name", None) == "span" and not nxt.get_text(strip=True):
                            middle.append(nxt)
                            j += 1
                            continue
                        break
                    if j < len(children):
                        end = children[j]
                        if isinstance(end, str):
                            stripped = end.strip()
                            # NOTE(review): the closer must be *exactly* ")";
                            # a node like ") tail" keeps the group intact —
                            # confirm this conservative behavior is intended.
                            if stripped == ")":
                                # Trim the "(" (and anything after it) off the
                                # opener node, drop the fillers and the closer.
                                new_text = node[:idx]
                                node.replace_with(new_text)
                                for m in middle:
                                    m.extract()
                                end.extract()
                                # Refresh the snapshot and re-examine index i.
                                children = list(el.children)
                                continue
            i += 1
|
|
|
|
def remove_orphan_headers(tag):
    """Drop h1–h4 headings that introduce no content.

    A heading is an orphan when — after skipping whitespace-only text and
    <br> siblings — it is followed by nothing, or by another heading of the
    same or a shallower level. Headings followed by a deeper sub-heading or
    by any non-heading sibling are kept.
    """
    rank = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}
    for heading in list(tag.find_all(["h1", "h2", "h3", "h4"])):
        own_rank = rank[heading.name]
        sibling = heading.find_next_sibling()
        # Skip over whitespace-only strings and <br> tags.
        while sibling and (
            (isinstance(sibling, str) and not sibling.strip())
            or getattr(sibling, "name", None) == "br"
        ):
            sibling = sibling.find_next_sibling()
        if not sibling:
            # Nothing at all follows: the heading is empty — remove it.
            heading.decompose()
            continue
        sibling_rank = rank.get(sibling.name)
        if sibling_rank is not None:
            # Followed by another heading: keep only if it is a sub-heading.
            if sibling_rank <= own_rank:
                heading.decompose()
            continue
        # Non-heading sibling: the heading has content and is kept.
        if sibling.get_text(strip=True):
            continue
|
|
|
|
def is_intro_rule_table(table):
    """Heuristically detect the page-top edition/rules notice box.

    Requires both a structural signal (a <table> that is either a wikitable
    or full-width) and an intro-style keyword in its flattened text.
    """
    flat_text = table.get_text(" ", strip=True).lower()
    css_classes = table.get("class") or []
    looks_structural = table.name == "table" and (
        "wikitable" in css_classes
        or table.get("width") == "100%"
    )
    intro_keywords = (
        "unlimited only",
        "prime",
        "legacy army",
        "mark iii",
        "mk4 icon",
        "rest of this page",
    )
    mentions_intro = any(keyword in flat_text for keyword in intro_keywords)
    return looks_structural and mentions_intro
|
|
|
|
def remove_intro_rule_box(content):
    """Remove a leading intro/edition notice table from *content*.

    Only the leading run of table/div children is scanned; scanning stops at
    the first paragraph or at any other kind of element.
    """
    for child in list(content.children):
        # Whitespace-only text between the leading elements is ignored.
        if isinstance(child, str) and not child.strip():
            continue
        child_name = getattr(child, "name", None)
        if child_name not in ("table", "div", "p"):
            break
        if child_name == "table":
            if is_intro_rule_table(child):
                child.decompose()
            continue
        if child_name == "p":
            break
|
|
|
|
# ======================
|
|
# CORE FUNCTIONS
|
|
# ======================
|
|
|
|
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one scraped MediaWiki HTML page and write the result.

    Parses *input_path*, strips comments, scripts/styles, navigation chrome,
    unwanted sections and presentation attributes from the #mw-content-text
    subtree, normalizes links and images, removes empty bracket artifacts,
    and writes the prettified content to *output_path*. Pages without a
    #mw-content-text element are skipped with a warning.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content; everything outside #mw-content-text is discarded.
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return
    remove_intro_rule_box(content)

    # Detect category pages (by page <title>) and drop their auto-generated
    # member listings.
    title = soup.title.string if soup.title else ""
    if title and "Category:" in title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        title = header.get_text(strip=True).lower()
        if is_unwanted_section(title):
            remove_section(header)

    # Remove unwanted class-based elements. Collapsibles are spared when they
    # sit under a "Rules Clarification" heading.
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden rows (display:none)
    # NOTE(review): only matches "display: none" with a space — inline styles
    # written as "display:none" slip through; confirm whether that matters.
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display: none" in style:
            el.decompose()

    # Strip attributes (presentation-only; see STRIP_ATTRIBUTES).
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Remove MediaWiki edit links & sections
    # NOTE(review): ".mw-editsection" is already in REMOVE_SELECTORS, so this
    # select is redundant but harmless.
    for el in content.select(".mw-editsection"):
        el.decompose()
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        # Keep anchors that are neither edit URLs nor literal "edit" links.
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            # If the whole parent is just the bracketed edit link, drop it too.
            # NOTE(review): both set members are the identical string
            # "(edit)" — one was likely meant to be "[edit]" (MediaWiki's
            # usual bracket form); confirm the intended second variant.
            parent_text = parent.get_text(strip=True)
            if parent_text.replace(" ", "") in {"(edit)", "(edit)"}:
                parent.decompose()
                continue
        a.decompose()

    # Remove MediaWiki show/hide links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    # Also drop text nodes that are collapsible-toggle captions.
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
        ]):
            el.extract()

    # Fix links (important for next steps): strip HTTrack's relative prefixes
    # and the MediaWiki index.php wrapper so hrefs become plain page names.
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    # Simplify image links: replace <a class="image"><img/></a> by the bare <img>.
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)
    for img in content.find_all("img"):
        src = img.get("src", "")
        src = src.replace("../", "")
        img["src"] = src

    # Empty-bracket cleanup: DOM-level passes first, then the inline textual
    # pass, then drop headings left without content.
    remove_empty_bracket_groups(content)
    remove_split_empty_parentheses(content)
    remove_inline_empty_brackets(content)
    remove_orphan_headers(content)

    # Output cleaned HTML
    cleaned_html = content.prettify()
    output_path.write_text(cleaned_html, encoding="utf-8")
|
|
|
|
|
|
def process_all():
    """Clean every ``*.html`` page in SOURCE_DIR into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for page in SOURCE_DIR.glob("*.html"):
        clean_html_file(page, OUTPUT_DIR / page.name)
    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()