# whu_migration_scripts/extract_content.py — cleans scraped MediaWiki pages for migration.
import os
import re
from pathlib import Path
from bs4 import BeautifulSoup, Comment
# ======================
# CONFIG
# ======================
# Input: raw scraped pages. Output: cleaned copies under the same file names.
SOURCE_DIR = Path("../output/pages")
OUTPUT_DIR = Path("../output/cleaned_pages")
# CSS selectors whose matches are deleted wholesale from the content
# (MediaWiki chrome: TOC, edit links, references, category boxes, ...).
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages"
]
# Tags removed entirely, together with their contents.
REMOVE_TAGS = [
    "script",
    "style",
]
# Section headings whose whole section is dropped; matched
# case-insensitively by substring (see is_unwanted_section).
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]
# Substrings of class names that mark removable elements.
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]
# Presentational attributes stripped from every remaining element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]
# ======================
# HELPERS
# ======================
def safe_elements(elements):
    """Yield each element that still exposes a non-empty ``attrs`` mapping.

    Snapshots *elements* first so callers may decompose nodes while iterating.
    """
    snapshot = list(elements)
    for element in snapshot:
        if getattr(element, "attrs", None):
            yield element
def is_unwanted_section(title: str) -> bool:
    """Return True when *title* names a section slated for removal.

    Matching is case-insensitive and by substring against
    SECTION_TITLES_TO_REMOVE.
    """
    lowered = title.lower()
    return any(unwanted in lowered for unwanted in SECTION_TITLES_TO_REMOVE)
def remove_section(header):
    """Decompose *header* and every following sibling up to (but not
    including) the next h1/h2/h3, or to the end of the parent."""
    current = header
    while True:
        # Grab the next sibling BEFORE destroying the current node.
        following = current.find_next_sibling()
        current.decompose()
        if not following or following.name in ("h1", "h2", "h3"):
            break
        current = following
def is_bad_collapsible(el):
    """Return True for elements carrying both the collapsible and navbox
    classes (collapsible navboxes are navigation chrome, not content)."""
    classes = el.get("class", [])
    # "navbox" is the discriminating class here; other criteria could be added.
    return "mw-collapsible" in classes and "navbox" in classes
def is_in_rules_clarifications(el):
    """Return True when the nearest preceding h1/h2/h3 heading contains
    "rules clarification" (case-insensitive)."""
    heading = el.find_previous(["h1", "h2", "h3"])
    if not heading:
        return False
    heading_text = heading.get_text(strip=True).lower()
    return "rules clarification" in heading_text
def remove_empty_bracket_groups(tag):
    """Remove empty bracket groups split across sibling nodes.

    Handles "(" [+ optional whitespace text node] + ")" triples, and
    "[" + any run of empty text / empty <span> nodes + "]" sequences —
    leftovers typically produced when references and links are stripped.
    """
    def is_empty_text(n):
        # Whitespace-only (or empty) text node.
        return isinstance(n, str) and not n.strip()

    def is_char(n, chars):
        # Text node whose stripped content is exactly one of *chars*.
        return isinstance(n, str) and n.strip() in chars

    def is_empty_span(n):
        # <span> with no visible text.
        return getattr(n, "name", None) == "span" and not n.get_text(strip=True)

    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]
            nxt = children[i + 1]

            # Case 1: "(" immediately followed by ")", or separated by a
            # single whitespace-only text node.
            if is_char(curr, {"("}):
                if is_char(nxt, {")"}):
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue
                if (
                    is_empty_text(nxt)
                    and i + 2 < len(children)
                    and is_char(children[i + 2], {")"})
                ):
                    # BUGFIX: the original extracted only "(" and the
                    # whitespace node, leaving an orphan ")" behind.
                    children[i + 2].extract()
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue

            # Case 2: "[" followed only by empty text/spans, then "]".
            if is_char(curr, {"["}):
                j = i + 1
                found_close = False
                while j < len(children):
                    node = children[j]
                    if is_char(node, {"]"}):
                        found_close = True
                        break
                    if not (is_empty_text(node) or is_empty_span(node)):
                        break
                    j += 1
                if found_close:
                    # Remove "[", the filler nodes, and "]".
                    curr.extract()
                    for k in range(i + 1, j + 1):
                        children[k].extract()
                    children = list(el.children)
                    continue

            i += 1
def remove_inline_empty_brackets(tag):
    """Delete empty "( )" / "[ ]" pairs occurring inside a single text node."""
    for text_node in list(tag.find_all(string=True)):
        original = str(text_node)
        # "( )" possibly containing spaces or newlines.
        cleaned = re.sub(r"\(\s*\)", "", original)
        # "[ ]" with whitespace, after spans were already cleaned out.
        cleaned = re.sub(r"\[\s*\]", "", cleaned)
        if cleaned != original:
            text_node.replace_with(cleaned)
def remove_split_empty_parentheses(tag):
    # Remove a "(" that ends one text node when it is followed only by
    # whitespace text nodes / empty <span>s and then a text node that is
    # exactly ")" — i.e. an empty parenthesis group split across siblings.
    for el in tag.find_all(True):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            if isinstance(node, str) and "(" in node:
                # Only act when the LAST "(" is the final visible
                # character of this text node.
                idx = node.rfind("(")
                if node[idx:].strip() == "(":
                    j = i + 1
                    middle = []  # removable filler between "(" and ")"
                    while j < len(children):
                        nxt = children[j]
                        if isinstance(nxt, str) and not nxt.strip():
                            middle.append(nxt)
                            j += 1
                            continue
                        if getattr(nxt, "name", None) == "span" and not nxt.get_text(strip=True):
                            middle.append(nxt)
                            j += 1
                            continue
                        break
                    if j < len(children):
                        end = children[j]
                        if isinstance(end, str):
                            stripped = end.strip()
                            # Closing node must contain nothing but ")".
                            if stripped == ")":
                                # Drop the trailing "(", the filler, and ")".
                                new_text = node[:idx]
                                node.replace_with(new_text)
                                for m in middle:
                                    m.extract()
                                end.extract()
                                # Re-snapshot and re-check the same index.
                                children = list(el.children)
                                continue
            i += 1
# ======================
# CORE FUNCTIONS
# ======================
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one scraped MediaWiki page and write the cleaned HTML.

    Keeps only the #mw-content-text subtree, strips wiki chrome (TOC,
    edit links, navboxes, hidden rows, show/hide toggles), simplifies
    links and images, then removes leftover empty bracket groups.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles wholesale.
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract the main content; skip pages without it.
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Category pages: drop the auto-generated member listing.
    title = soup.title.string if soup.title else ""
    if title and "Category:" in title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks by selector. This already covers
    # ".mw-editsection", so no second dedicated pass is needed.
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories).
    for header in content.find_all(["h1", "h2", "h3"]):
        if is_unwanted_section(header.get_text(strip=True).lower()):
            remove_section(header)

    # Remove unwanted class-based elements; collapsibles survive only
    # inside "Rules Clarification" sections.
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden rows. BUGFIX: also match "display:none" written
    # without a space after the colon.
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", ""):
            el.decompose()

    # Remove leftover MediaWiki edit links.
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        # If the link sits alone in an "(edit)" wrapper, drop the wrapper.
        # (The original compared against a set containing "(edit)" twice.)
        if parent and parent.get_text(strip=True).replace(" ", "") == "(edit)":
            parent.decompose()
            continue
        a.decompose()

    # Remove MediaWiki show/hide toggles. BUGFIX: this class-based pass
    # (and the image-link pass below) must run BEFORE attribute stripping;
    # the original stripped "class" first, so these selectors never matched.
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
        ]):
            el.extract()

    # Simplify image links: <a class="image"><img/></a> -> <img/>.
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)

    # Fix links and image sources (important for next migration steps).
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        a["href"] = href.replace("../", "").replace("index.php?title=", "")
    for img in content.find_all("img"):
        img["src"] = img.get("src", "").replace("../", "")

    # Strip presentational attributes LAST so class-based passes above
    # still see them.
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Remove empty "( )" / "[ ]" leftovers.
    remove_empty_bracket_groups(content)
    remove_split_empty_parentheses(content)
    remove_inline_empty_brackets(content)

    # Output cleaned HTML.
    output_path.write_text(content.prettify(), encoding="utf-8")
def process_all():
    """Clean every *.html file in SOURCE_DIR into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for source_file in SOURCE_DIR.glob("*.html"):
        clean_html_file(source_file, OUTPUT_DIR / source_file.name)
    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()