extract page content WIP
This commit is contained in:
parent
0aace3dfc8
commit
c9fb3513ee
3 changed files with 226 additions and 1 deletions
225
extract_content.py
Normal file
225
extract_content.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
# ======================
# CONFIG
# ======================

# Input: directory of raw mirrored HTML pages (relative to the CWD).
SOURCE_DIR = Path("../output/pages")
# Output: directory for cleaned pages; created on demand by process_all().
OUTPUT_DIR = Path("../output/cleaned_pages")

# CSS selectors for MediaWiki navigation/chrome removed wholesale.
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages"
]

# Tag names whose elements carry no page content and are dropped entirely.
REMOVE_TAGS = [
    "script",
    "style",
]

# Lowercase substrings; any heading matching one has its whole section removed.
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

# Elements whose class list contains any of these substrings are removed
# ("mw-collapsible" gets special-cased handling in clean_html_file).
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

# Presentational attributes stripped from every remaining element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]
|
||||
|
||||
# ======================
|
||||
# HELPERS
|
||||
# ======================
|
||||
|
||||
def safe_elements(elements):
    """Yield only the items that expose a truthy ``attrs`` mapping.

    Skips anything without attributes (e.g. bare text nodes, which lack
    ``attrs``), so callers can read ``el.attrs`` without guarding.
    The input is snapshotted up front because callers decompose elements
    while iterating.
    """
    snapshot = list(elements)
    for element in snapshot:
        if getattr(element, "attrs", None):
            yield element
|
||||
|
||||
def is_unwanted_section(title: str, unwanted=None) -> bool:
    """Return True if a section heading should be removed.

    Args:
        title: Section heading text, any case.
        unwanted: Optional iterable of lowercase substrings to match
            against; defaults to the module-level SECTION_TITLES_TO_REMOVE.
            (Generalized from the hard-coded constant; existing callers
            are unaffected.)

    Returns:
        True when any unwanted substring occurs in the lowercased title.
    """
    if unwanted is None:
        unwanted = SECTION_TITLES_TO_REMOVE
    lowered = title.lower()
    return any(t in lowered for t in unwanted)
|
||||
|
||||
def remove_section(header):
    """Delete *header* and every following sibling up to the next heading.

    Walks forward from the header, decomposing each sibling in turn, and
    stops once the next sibling is missing/empty or is another h1/h2/h3
    (that heading itself is kept).
    """
    node = header
    while True:
        # Grab the successor BEFORE decomposing, since decompose()
        # detaches the node from the tree.
        successor = node.find_next_sibling()
        node.decompose()
        # NOTE: `not successor` (rather than `is None`) also stops on an
        # empty (falsy) tag, matching the original behavior.
        if not successor or successor.name in ("h1", "h2", "h3"):
            break
        node = successor
|
||||
|
||||
def is_bad_collapsible(el):
    """Return True when *el* is a collapsible navbox.

    Both markers must be present in the element's class list.
    """
    classes = el.get("class", [])
    has_collapsible = "mw-collapsible" in classes
    has_navbox = "navbox" in classes  # or some other criterion
    return has_collapsible and has_navbox
|
||||
|
||||
|
||||
def is_in_rules_clarifications(el):
    """Return True if *el* sits under a "Rules Clarification" heading.

    Looks at the nearest preceding h1/h2/h3 in document order; elements
    with no preceding heading are considered outside any such section.
    """
    heading = el.find_previous(["h1", "h2", "h3"])
    if not heading:
        return False
    return "rules clarification" in heading.get_text(strip=True).lower()
|
||||
|
||||
# ======================
|
||||
# CORE FUNCTIONS
|
||||
# ======================
|
||||
|
||||
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one mirrored MediaWiki page and write the result.

    Reads *input_path*, extracts the ``#mw-content-text`` container, strips
    navigation chrome, edit links, hidden and collapsible elements, and
    presentational attributes, rewrites relative links, and writes the
    prettified fragment to *output_path*.

    Pages without a ``#mw-content-text`` node are skipped with a warning.

    Fixes over the original:
      * the "(edit)" wrapper set contained "(edit)" twice; the second entry
        is now "[edit]", the form MediaWiki actually renders;
      * the hidden-element check now matches both "display:none" and
        "display: none" spellings;
      * attribute stripping (which removes ``class``) used to run BEFORE the
        class-based passes (.mw-editsection, .mw-collapsible-toggle…,
        a.image), making them dead code — it now runs last.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack mirroring notes etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove non-content tags (scripts/styles)
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Detect category pages and drop their auto-generated listings
    page_title = soup.title.string if soup.title else ""
    if page_title and "Category:" in page_title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        heading_text = header.get_text(strip=True).lower()
        if is_unwanted_section(heading_text):
            remove_section(header)

    # Remove unwanted class-based elements; collapsible boxes survive only
    # inside "Rules Clarification" sections.
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden elements. Normalizing whitespace catches both
    # "display:none" and "display: none" (the original matched only the
    # spaced form).
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", ""):
            el.decompose()

    # Remove MediaWiki edit links & sections
    for el in content.select(".mw-editsection"):
        el.decompose()
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            parent_text = parent.get_text(strip=True)
            # Drop the whole "(edit)" / "[edit]" wrapper when the link is
            # all it contains. (The original set listed "(edit)" twice.)
            if parent_text.replace(" ", "") in {"(edit)", "[edit]"}:
                parent.decompose()
                continue
        a.decompose()

    # Drop "(" / ")" text nodes orphaned by the edit-link removal
    for el in content.find_all(string=True):
        t = el.strip()
        if t in {"(", ")"}:
            if len(el.parent.get_text(strip=True)) <= 2:
                el.extract()

    # Remove MediaWiki show/hide toggle links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
            "( )",
            "[ ]"
        ]):
            el.extract()

    # Fix links (important for next steps)
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    # Simplify image links: replace <a class="image"><img/></a> with its <img>
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)
    for img in content.find_all("img"):
        src = img.get("src", "")
        img["src"] = src.replace("../", "")

    # Strip presentational attributes LAST, after every class-based pass
    # above has run (stripping removes "class", so doing this earlier made
    # those passes no-ops).
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Output cleaned HTML
    output_path.write_text(content.prettify(), encoding="utf-8")
|
||||
|
||||
|
||||
def process_all():
    """Clean every ``*.html`` page in SOURCE_DIR into OUTPUT_DIR.

    Ensures the output directory exists, then runs clean_html_file on each
    page, writing the cleaned copy under the same file name.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for source_file in SOURCE_DIR.glob("*.html"):
        clean_html_file(source_file, OUTPUT_DIR / source_file.name)

    print("✅ Cleaning complete")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: clean every mirrored page in SOURCE_DIR.
    process_all()
|
||||
Loading…
Add table
Add a link
Reference in a new issue