extract page content WIP
parent 0aace3dfc8
commit c9fb3513ee
3 changed files with 226 additions and 1 deletion

extract_content.py (new file, 225 lines)
@@ -0,0 +1,225 @@
import os
from pathlib import Path

from bs4 import BeautifulSoup, Comment


# ======================
# CONFIG
# ======================

SOURCE_DIR = Path("../output/pages")
OUTPUT_DIR = Path("../output/cleaned_pages")
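
# CSS selectors for MediaWiki chrome (TOC, edit links, category listings,
# print footer) that never holds article text.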
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages",
]

REMOVE_TAGS = [
    "script",
    "style",
]

SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]


# ======================
# HELPERS
# ======================
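
# Filter to tags that still expose a non-empty attrs dict, so later passes
# can mutate the tree safely while iterating.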
def safe_elements(elements):
    for el in list(elements):
        if getattr(el, "attrs", None):
            yield el
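

# True when a heading's text contains one of SECTION_TITLES_TO_REMOVE.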
def is_unwanted_section(title: str) -> bool:
    title = title.lower()
    return any(t in title for t in SECTION_TITLES_TO_REMOVE)
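

# Delete a heading plus every following sibling up to, but not including,
# the next h1/h2/h3, i.e. the whole section the heading introduces.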
def remove_section(header):
    el = header
    while True:
        next_el = el.find_next_sibling()
        el.decompose()
        if not next_el or next_el.name in ["h1", "h2", "h3"]:
            break
        el = next_el
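

# Currently unused (WIP): would flag collapsible navboxes for removal.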
def is_bad_collapsible(el):
    classes = el.get("class", [])
    return (
        "mw-collapsible" in classes
        and "navbox" in classes  # or another criterion
    )
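

# Collapsible blocks under a "Rules Clarification" heading are kept;
# this checks whether the nearest preceding heading is such a section.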
def is_in_rules_clarifications(el):
    prev = el.find_previous(["h1", "h2", "h3"])
    if not prev:
        return False
    title = prev.get_text(strip=True).lower()
    return "rules clarification" in title


# ======================
# CORE FUNCTIONS
# ======================
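

# Full cleaning pipeline for one page: parse, strip chrome and hidden
# markup, rewrite links, then write the prettified #mw-content-text.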
def clean_html_file(input_path: Path, output_path: Path):
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Detect category pages
    title = soup.title.string if soup.title else ""
    if title and "Category:" in title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        title = header.get_text(strip=True).lower()
        if is_unwanted_section(title):
            remove_section(header)

    # Remove unwanted class-based elements; collapsibles survive only inside
    # "Rules Clarification" sections
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden rows (display:none); must run before style attributes
    # are stripped below. Normalize whitespace so "display: none" matches too.
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", ""):
            el.decompose()

    # Strip attributes
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Remove MediaWiki edit links & sections
    for el in content.select(".mw-editsection"):
        el.decompose()
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            parent_text = parent.get_text(strip=True)
            # Drop the whole "(edit)" / "[edit]" wrapper, not just the link
            if parent_text.replace(" ", "") in {"(edit)", "[edit]"}:
                parent.decompose()
                continue
        a.decompose()

    # Drop stray "(" / ")" text nodes left behind after edit-link removal
    for el in content.find_all(string=True):
        t = el.strip()
        if t in {"(", ")"}:
            if len(el.parent.get_text(strip=True)) <= 2:
                el.extract()

    # Remove MediaWiki show/hide links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
            "( )",
            "[ ]",
        ]):
            el.extract()

    # Fix links (important for next steps)
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    # Simplify image links
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)
    for img in content.find_all("img"):
        src = img.get("src", "")
        src = src.replace("../", "")
        img["src"] = src

    # Output cleaned HTML
    cleaned_html = content.prettify()
    output_path.write_text(cleaned_html, encoding="utf-8")
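

# Clean every .html page in SOURCE_DIR into OUTPUT_DIR, keeping filenames.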
def process_all():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for file in SOURCE_DIR.glob("*.html"):
        output_file = OUTPUT_DIR / file.name
        clean_html_file(file, output_file)

    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()

@@ -9,7 +9,7 @@ from difflib import SequenceMatcher
 from bs4 import BeautifulSoup
 import unicodedata

-SOURCE_DIR = Path("../original_index")
+SOURCE_DIR = Path("../test")
 OUTPUT_DIR = Path("../output")

 PAGES_DIR = Path(OUTPUT_DIR / "pages")