# whu_migration_scripts/extract_content.py — cleans scraped MediaWiki pages for migration.
import os
import re
from pathlib import Path
from bs4 import BeautifulSoup, Comment
# ======================
# CONFIG
# ======================
# Input: raw scraped pages. Output: cleaned copies under the same file names.
SOURCE_DIR = Path("../output/pages")
OUTPUT_DIR = Path("../output/cleaned_pages")
# CSS selectors whose matches are deleted wholesale from the content
# (MediaWiki chrome: TOC, edit links, references, category boxes, ...).
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages"
]
# Tags removed entirely, together with their contents.
REMOVE_TAGS = [
    "script",
    "style",
]
# Section headings whose whole section is dropped; matched
# case-insensitively by substring (see is_unwanted_section).
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]
# Substrings of class names that mark removable elements.
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]
# Presentational attributes stripped from every remaining element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]
# ======================
# HELPERS
# ======================
def safe_elements(elements):
    """Yield each element that still exposes a non-empty ``attrs`` mapping.

    Snapshots *elements* first so callers may decompose nodes while iterating.
    """
    snapshot = list(elements)
    for element in snapshot:
        if getattr(element, "attrs", None):
            yield element
def is_unwanted_section(title: str) -> bool:
    """Return True when *title* names a section slated for removal.

    Matching is case-insensitive and by substring against
    SECTION_TITLES_TO_REMOVE.
    """
    lowered = title.lower()
    return any(unwanted in lowered for unwanted in SECTION_TITLES_TO_REMOVE)
def remove_section(header):
    """Decompose *header* and every following sibling up to (but not
    including) the next h1/h2/h3, or to the end of the parent."""
    current = header
    while True:
        # Grab the next sibling BEFORE destroying the current node.
        following = current.find_next_sibling()
        current.decompose()
        if not following or following.name in ("h1", "h2", "h3"):
            break
        current = following
def is_bad_collapsible(el):
    """Return True for elements carrying both the collapsible and navbox
    classes (collapsible navboxes are navigation chrome, not content)."""
    classes = el.get("class", [])
    # "navbox" is the discriminating class here; other criteria could be added.
    return "mw-collapsible" in classes and "navbox" in classes
def is_in_rules_clarifications(el):
    """Return True when the nearest preceding h1/h2/h3 heading contains
    "rules clarification" (case-insensitive)."""
    heading = el.find_previous(["h1", "h2", "h3"])
    if not heading:
        return False
    heading_text = heading.get_text(strip=True).lower()
    return "rules clarification" in heading_text
def remove_empty_bracket_groups(tag):
    """Remove empty bracket groups split across sibling nodes.

    Handles "(" [+ optional whitespace text node] + ")" triples, and
    "[" + any run of empty text / empty <span> nodes + "]" sequences —
    leftovers typically produced when references and links are stripped.
    """
    def is_empty_text(n):
        # Whitespace-only (or empty) text node.
        return isinstance(n, str) and not n.strip()

    def is_char(n, chars):
        # Text node whose stripped content is exactly one of *chars*.
        return isinstance(n, str) and n.strip() in chars

    def is_empty_span(n):
        # <span> with no visible text.
        return getattr(n, "name", None) == "span" and not n.get_text(strip=True)

    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]
            nxt = children[i + 1]

            # Case 1: "(" immediately followed by ")", or separated by a
            # single whitespace-only text node.
            if is_char(curr, {"("}):
                if is_char(nxt, {")"}):
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue
                if (
                    is_empty_text(nxt)
                    and i + 2 < len(children)
                    and is_char(children[i + 2], {")"})
                ):
                    # BUGFIX: the original extracted only "(" and the
                    # whitespace node, leaving an orphan ")" behind.
                    children[i + 2].extract()
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue

            # Case 2: "[" followed only by empty text/spans, then "]".
            if is_char(curr, {"["}):
                j = i + 1
                found_close = False
                while j < len(children):
                    node = children[j]
                    if is_char(node, {"]"}):
                        found_close = True
                        break
                    if not (is_empty_text(node) or is_empty_span(node)):
                        break
                    j += 1
                if found_close:
                    # Remove "[", the filler nodes, and "]".
                    curr.extract()
                    for k in range(i + 1, j + 1):
                        children[k].extract()
                    children = list(el.children)
                    continue

            i += 1
def remove_inline_empty_brackets(tag):
    """Delete empty "( )" / "[ ]" pairs occurring inside a single text node."""
    for text_node in list(tag.find_all(string=True)):
        original = str(text_node)
        # "( )" possibly containing spaces or newlines.
        cleaned = re.sub(r"\(\s*\)", "", original)
        # "[ ]" with whitespace, after spans were already cleaned out.
        cleaned = re.sub(r"\[\s*\]", "", cleaned)
        if cleaned != original:
            text_node.replace_with(cleaned)
def remove_split_empty_parentheses(tag):
    # Remove a "(" that ends one text node when it is followed only by
    # whitespace text nodes / empty <span>s and then a text node that is
    # exactly ")" — i.e. an empty parenthesis group split across siblings.
    for el in tag.find_all(True):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            if isinstance(node, str) and "(" in node:
                # Only act when the LAST "(" is the final visible
                # character of this text node.
                idx = node.rfind("(")
                if node[idx:].strip() == "(":
                    j = i + 1
                    middle = []  # removable filler between "(" and ")"
                    while j < len(children):
                        nxt = children[j]
                        if isinstance(nxt, str) and not nxt.strip():
                            middle.append(nxt)
                            j += 1
                            continue
                        if getattr(nxt, "name", None) == "span" and not nxt.get_text(strip=True):
                            middle.append(nxt)
                            j += 1
                            continue
                        break
                    if j < len(children):
                        end = children[j]
                        if isinstance(end, str):
                            stripped = end.strip()
                            # Closing node must contain nothing but ")".
                            if stripped == ")":
                                # Drop the trailing "(", the filler, and ")".
                                new_text = node[:idx]
                                node.replace_with(new_text)
                                for m in middle:
                                    m.extract()
                                end.extract()
                                # Re-snapshot and re-check the same index.
                                children = list(el.children)
                                continue
            i += 1
# ======================
# CORE FUNCTIONS
# ======================
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one scraped MediaWiki page and write the cleaned HTML.

    Keeps only the #mw-content-text subtree, strips wiki chrome (TOC,
    edit links, navboxes, hidden rows, show/hide toggles), simplifies
    links and images, then removes leftover empty bracket groups.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles wholesale.
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract the main content; skip pages without it.
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Category pages: drop the auto-generated member listing.
    title = soup.title.string if soup.title else ""
    if title and "Category:" in title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks by selector. This already covers
    # ".mw-editsection", so no second dedicated pass is needed.
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories).
    for header in content.find_all(["h1", "h2", "h3"]):
        if is_unwanted_section(header.get_text(strip=True).lower()):
            remove_section(header)

    # Remove unwanted class-based elements; collapsibles survive only
    # inside "Rules Clarification" sections.
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden rows. BUGFIX: also match "display:none" written
    # without a space after the colon.
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", ""):
            el.decompose()

    # Remove leftover MediaWiki edit links.
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        # If the link sits alone in an "(edit)" wrapper, drop the wrapper.
        # (The original compared against a set containing "(edit)" twice.)
        if parent and parent.get_text(strip=True).replace(" ", "") == "(edit)":
            parent.decompose()
            continue
        a.decompose()

    # Remove MediaWiki show/hide toggles. BUGFIX: this class-based pass
    # (and the image-link pass below) must run BEFORE attribute stripping;
    # the original stripped "class" first, so these selectors never matched.
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
        ]):
            el.extract()

    # Simplify image links: <a class="image"><img/></a> -> <img/>.
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)

    # Fix links and image sources (important for next migration steps).
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        a["href"] = href.replace("../", "").replace("index.php?title=", "")
    for img in content.find_all("img"):
        img["src"] = img.get("src", "").replace("../", "")

    # Strip presentational attributes LAST so class-based passes above
    # still see them.
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Remove empty "( )" / "[ ]" leftovers.
    remove_empty_bracket_groups(content)
    remove_split_empty_parentheses(content)
    remove_inline_empty_brackets(content)

    # Output cleaned HTML.
    output_path.write_text(content.prettify(), encoding="utf-8")
def process_all():
    """Clean every *.html file in SOURCE_DIR into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for source_file in SOURCE_DIR.glob("*.html"):
        clean_html_file(source_file, OUTPUT_DIR / source_file.name)
    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()