"""Clean mirrored MediaWiki pages down to their main content.

Reads raw HTML mirrors (HTTrack output) from SOURCE_DIR, strips MediaWiki
chrome (TOC, edit links, navboxes, categories, hidden elements, inline
presentation attributes), rewrites links/images to local relative form, and
writes the cleaned fragment of ``#mw-content-text`` to OUTPUT_DIR.
"""

from pathlib import Path

from bs4 import BeautifulSoup, Comment

# ======================
# CONFIG
# ======================

SOURCE_DIR = Path("../output/pages")          # raw mirrored wiki pages
OUTPUT_DIR = Path("../output/cleaned_pages")  # cleaned HTML is written here

# CSS selectors for MediaWiki chrome that never belongs in extracted content.
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages",
]

# Tags whose entire subtree is dropped outright.
REMOVE_TAGS = [
    "script",
    "style",
]

# Section headings (lower-cased substring match) whose whole section is dropped.
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

# Elements whose class list contains any of these substrings are dropped.
# "mw-collapsible" is special-cased in clean_html_file: collapsibles inside a
# "Rules Clarification" section are kept.
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

# Presentational attributes stripped from every surviving element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]

# ======================
# HELPERS
# ======================

def safe_elements(elements):
    """Yield only elements that still carry a non-empty ``attrs`` dict.

    The input is materialized first so callers may ``decompose()`` elements
    while consuming the generator; elements destroyed earlier in the same
    pass lose their attributes and are skipped.
    """
    for el in list(elements):
        if getattr(el, "attrs", None):
            yield el


def is_unwanted_section(title: str) -> bool:
    """Return True when *title* matches a section we always remove."""
    title = title.lower()
    return any(t in title for t in SECTION_TITLES_TO_REMOVE)


def remove_section(header) -> None:
    """Decompose *header* and every sibling up to the next h1/h2/h3."""
    el = header
    while True:
        next_el = el.find_next_sibling()
        el.decompose()
        if not next_el or next_el.name in ["h1", "h2", "h3"]:
            break
        el = next_el


def is_bad_collapsible(el) -> bool:
    """Return True for collapsible navboxes.

    NOTE(review): currently unused — clean_html_file applies its own
    collapsible policy via is_in_rules_clarifications.
    """
    classes = el.get("class", [])
    return (
        "mw-collapsible" in classes
        and "navbox" in classes  # or some other criterion
    )


def is_in_rules_clarifications(el) -> bool:
    """Return True when *el* sits under a "Rules Clarification" heading."""
    prev = el.find_previous(["h1", "h2", "h3"])
    if not prev:
        return False
    title = prev.get_text(strip=True).lower()
    return "rules clarification" in title

# ======================
# CORE FUNCTIONS
# ======================

def clean_html_file(input_path: Path, output_path: Path) -> None:
    """Clean one mirrored page and write the result.

    Extracts ``#mw-content-text``, removes MediaWiki chrome, rewrites
    hrefs/srcs to local relative form, then strips presentational
    attributes and writes the prettified fragment to *output_path*.
    Pages without a content container are skipped with a warning.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Detect category pages
    title = soup.title.string if soup.title else ""
    if title and "Category:" in title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        # Separate name so the page title above is not clobbered.
        heading_text = header.get_text(strip=True).lower()
        if is_unwanted_section(heading_text):
            remove_section(header)

    # Remove unwanted class-based elements
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            # Collapsibles are kept only inside "Rules Clarification" sections.
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden rows. BUGFIX: also match the space-less "display:none"
    # spelling, which is common in inline styles.
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", "").lower():
            el.decompose()

    # Remove MediaWiki edit links & sections (.mw-editsection itself is
    # already gone via REMOVE_SELECTORS; this catches bare edit anchors).
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            parent_text = parent.get_text(strip=True)
            # BUGFIX: the set held "(edit)" twice; MediaWiki renders section
            # edit links as "[edit]", so match both bracket styles.
            if parent_text.replace(" ", "") in {"(edit)", "[edit]"}:
                parent.decompose()
                continue
        a.decompose()

    # Drop stray parentheses left behind by removed edit links.
    for el in content.find_all(string=True):
        t = el.strip()
        if t in {"(", ")"}:
            if len(el.parent.get_text(strip=True)) <= 2:
                el.extract()

    # Remove MediaWiki show/hide links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
            "( )",
            "[ ]"
        ]):
            el.extract()

    # Fix links (important for next steps)
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    # Simplify image links
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)
    for img in content.find_all("img"):
        src = img.get("src", "")
        src = src.replace("../", "")
        img["src"] = src

    # Strip presentational attributes. BUGFIX: this previously ran BEFORE the
    # class-selector passes above (edit-link cleanup, collapsible toggles,
    # a.image simplification), so removing "class" made all of them dead
    # code. It must be the last structural step.
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Output cleaned HTML
    cleaned_html = content.prettify()
    output_path.write_text(cleaned_html, encoding="utf-8")


def process_all():
    """Clean every ``*.html`` page in SOURCE_DIR into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for file in SOURCE_DIR.glob("*.html"):
        output_file = OUTPUT_DIR / file.name
        clean_html_file(file, output_file)

    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()
Path("../test")  # NOTE(review): SOURCE_DIR was repointed from "../original_index" to a test directory — confirm this is not a debugging leftover before merging
OUTPUT_DIR = Path("../output")  # root directory for all generated artifacts
PAGES_DIR = Path(OUTPUT_DIR / "pages")  # per-page HTML output; outer Path() is redundant but harmless