import os  # NOTE(review): appears unused in this file; kept to avoid breaking unseen callers
import re
from pathlib import Path

from bs4 import BeautifulSoup, Comment

# ======================
# CONFIG
# ======================
SOURCE_DIR = Path("../output/pages")
OUTPUT_DIR = Path("../output/cleaned_pages")

# CSS selectors for MediaWiki chrome that is always removed.
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages",
]

# Tags whose content is never useful in the cleaned output.
REMOVE_TAGS = [
    "script",
    "style",
]

# Section headings (lowercase substrings) whose whole section is dropped.
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

# Class-name substrings that mark removable boxes.
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

# Presentation attributes stripped from every element near the end.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]

# Lowercase substrings identifying an HTTP-error page saved by HTTrack.
ERROR_PAGE_PATTERNS = [
    # 5xx
    "503 service unavailable",
    "502 bad gateway",
    "500 internal server error",
    "504 gateway time",
    # 4xx
    "400 bad request",
    "401 unauthorized",
    "403 forbidden",
    "404 not found",
    "408 request time",
    "419 page expired",
    "429 too many requests",
    # generic
    "temporarily busy",
    "server error",
    "internal error",
    "page not found",
    "request could not be satisfied",
]


# ======================
# HELPERS
# ======================
def safe_elements(elements):
    """Yield only elements that still carry an ``attrs`` dict.

    Decomposed elements lose ``attrs``; filtering here lets callers mutate
    the tree while iterating over a snapshot of it.
    """
    for el in list(elements):
        if getattr(el, "attrs", None):
            yield el


def is_unwanted_section(title: str) -> bool:
    """Return True when *title* contains one of SECTION_TITLES_TO_REMOVE."""
    title = title.lower()
    return any(t in title for t in SECTION_TITLES_TO_REMOVE)


def remove_section(header):
    """Remove *header* and every sibling up to (excluding) the next h1-h3."""
    el = header
    while True:
        next_el = el.find_next_sibling()
        # FIX: text nodes (NavigableString) have no decompose(); extract()
        # removes them from the tree just the same.
        if hasattr(el, "decompose"):
            el.decompose()
        else:
            el.extract()
        if not next_el or getattr(next_el, "name", None) in ["h1", "h2", "h3"]:
            break
        el = next_el


def is_bad_collapsible(el):
    """Return True for a collapsible navbox.

    NOTE(review): not called anywhere in this file; kept as public API.
    """
    classes = el.get("class", [])
    return (
        "mw-collapsible" in classes
        and "navbox" in classes  # or another criterion
    )


def is_in_rules_clarifications(el):
    """True when the closest preceding h1-h3 is a 'Rules Clarification' heading."""
    prev = el.find_previous(["h1", "h2", "h3"])
    if not prev:
        return False
    title = prev.get_text(strip=True).lower()
    return "rules clarification" in title


def _is_empty_text(n) -> bool:
    """True for a whitespace-only text node."""
    return isinstance(n, str) and not n.strip()


def _is_char(n, chars) -> bool:
    """True when *n* is a text node whose stripped content is one of *chars*."""
    return isinstance(n, str) and n.strip() in chars


def _is_empty_span(n) -> bool:
    """True for a <span> element with no visible text."""
    return getattr(n, "name", None) == "span" and not n.get_text(strip=True)


def remove_empty_bracket_groups(tag):
    """Drop "( )" and "[ ]" groups left empty after earlier removals.

    Handles brackets split across sibling nodes, e.g. a "(" text node,
    whitespace / empty spans, then a ")" text node.
    """
    for el in list(tag.find_all(True)):
        children = list(el.children)
        i = 0
        while i < len(children) - 1:
            curr = children[i]
            nxt = children[i + 1]

            # "(" directly -- or across one whitespace node -- followed by ")"
            if _is_char(curr, {"("}):
                if _is_char(nxt, {")"}):
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue
                if (
                    _is_empty_text(nxt)
                    and i + 2 < len(children)
                    and _is_char(children[i + 2], {")"})
                ):
                    # FIX: the original extracted only "(" and the whitespace
                    # node, leaving an orphan ")" behind in the document.
                    children[i + 2].extract()
                    curr.extract()
                    nxt.extract()
                    children = list(el.children)
                    continue

            # "[" followed only by whitespace / empty spans up to "]"
            if _is_char(curr, {"["}):
                j = i + 1
                found_close = False
                while j < len(children):
                    node = children[j]
                    if _is_char(node, {"]"}):
                        found_close = True
                        break
                    if not (_is_empty_text(node) or _is_empty_span(node)):
                        break
                    j += 1
                if found_close:
                    # remove [ ... ]
                    curr.extract()
                    for k in range(i + 1, j + 1):
                        children[k].extract()
                    children = list(el.children)
                    continue
            i += 1


def remove_inline_empty_brackets(tag):
    """Collapse "( )" / "[ ]" that sit inside a single text node."""
    for node in list(tag.find_all(string=True)):
        text = str(node)
        # "( )" with spaces or newlines inside
        new_text = re.sub(r"\(\s*\)", "", text)
        # "[ ]" with spaces, or spans already cleaned away
        new_text = re.sub(r"\[\s*\]", "", new_text)
        if new_text != text:
            node.replace_with(new_text)


def remove_split_empty_parentheses(tag):
    """Remove a trailing "(" in a text node + empty fillers + a ")" node."""
    for el in tag.find_all(True):
        children = list(el.children)
        i = 0
        while i < len(children):
            node = children[i]
            if isinstance(node, str) and "(" in node:
                idx = node.rfind("(")
                if node[idx:].strip() == "(":
                    # collect whitespace text nodes / empty spans after "("
                    j = i + 1
                    middle = []
                    while j < len(children):
                        nxt = children[j]
                        if _is_empty_text(nxt) or _is_empty_span(nxt):
                            middle.append(nxt)
                            j += 1
                            continue
                        break
                    if j < len(children):
                        end = children[j]
                        if isinstance(end, str) and end.strip() == ")":
                            # drop the "(", the fillers and the ")"
                            node.replace_with(node[:idx])
                            for m in middle:
                                m.extract()
                            end.extract()
                            children = list(el.children)
                            continue
            i += 1


def remove_orphan_headers(tag):
    """Remove h1-h4 headers that have no content before the next header."""
    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}
    for header in list(tag.find_all(["h1", "h2", "h3", "h4"])):
        level = levels[header.name]

        # skip whitespace text nodes and <br> directly after the header
        nxt = header.find_next_sibling()
        while nxt and (
            (isinstance(nxt, str) and not nxt.strip())
            or getattr(nxt, "name", None) == "br"
        ):
            nxt = nxt.find_next_sibling()

        if not nxt:
            # header at the very end of its parent -> orphan
            header.decompose()
            continue

        name = getattr(nxt, "name", None)
        if name in levels and levels[name] > level:
            # a deeper sub-header follows: keep this header
            continue
        if name in levels and levels[name] <= level:
            # a same-or-shallower header follows immediately -> orphan
            header.decompose()
            continue
        # NOTE(review): when the following sibling has no text, the original
        # code fell through and kept the header (dead trailing branch);
        # that behavior is preserved here.
        if nxt.get_text(strip=True):
            continue


def is_intro_rule_table(table):
    """Heuristic: the table is an edition/format notice box atop a page."""
    text = table.get_text(" ", strip=True).lower()
    has_structure_signal = (
        table.name == "table"
        and (
            "wikitable" in (table.get("class") or [])
            or table.get("width") == "100%"
        )
    )
    has_intro_signal = any(
        k in text
        for k in [
            "unlimited only",
            "prime",
            "legacy army",
            "mark iii",
            "mk4 icon",
            "rest of this page",
        ]
    )
    return has_structure_signal and has_intro_signal


def remove_intro_rule_box(content):
    """Drop leading rule/format notice tables before the first paragraph."""
    for el in list(content.children):
        if isinstance(el, str) and not el.strip():
            continue
        name = getattr(el, "name", None)
        if name not in ["table", "div", "p"]:
            break
        if name == "table":
            if is_intro_rule_table(el):
                el.decompose()
            continue
        if name == "p":
            break


def is_error_page(soup: BeautifulSoup) -> bool:
    """True when the page text looks like a saved HTTP error page.

    NOTE(review): matches substrings anywhere in the page text, so an
    article merely quoting e.g. "page not found" would be false-flagged.
    """
    text = soup.get_text(" ", strip=True).lower()
    return any(p in text for p in ERROR_PAGE_PATTERNS)


def build_fallback_html(title: str, filename: str) -> str:
    """Build a minimal replacement page for sources whose content was lost.

    NOTE(review): the original template's markup was mangled in this copy of
    the file (only its visible text survived); reconstructed here as a
    minimal HTML page carrying that same text -- confirm against upstream.
    """
    safe_title = title or filename.replace("_", " ").replace(".html", "")
    return f"""<!DOCTYPE html>
<html>
<head><title>{safe_title}</title></head>
<body>
<h1>{safe_title}</h1>
<p>Lost content (HTTracker) in {filename}</p>
</body>
</html>
"""


# ======================
# CORE FUNCTIONS
# ======================
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one HTTrack-saved MediaWiki page and write the result.

    Writes a fallback page when the main content block is missing or the
    file is a saved HTTP error page.
    """
    html_page = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html_page, "html.parser")

    # Remove comments (HTTrack etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove scripts/styles
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content
    content = soup.select_one("#mw-content-text")
    if not content or is_error_page(soup):
        print(f"[WARN] No content in {input_path.name}")
        fallback = build_fallback_html(
            title=soup.title.get_text(strip=True) if soup.title else "",
            filename=input_path.name,
        )
        output_path.write_text(fallback, encoding="utf-8")
        return

    remove_intro_rule_box(content)

    # Detect category pages
    page_title = soup.title.string if soup.title else ""
    if page_title and "Category:" in page_title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        header_title = header.get_text(strip=True).lower()
        if is_unwanted_section(header_title):
            remove_section(header)

    # Remove unwanted class-based elements
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            # collapsibles are kept only inside "Rules Clarification" sections
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(
            rc in c
            for c in classes
            for rc in REMOVE_CLASSES_CONTAINS
            if rc != "mw-collapsible"
        ):
            el.decompose()

    # Remove hidden rows (display:none) -- must run before style is stripped
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display: none" in style:
            el.decompose()

    # Remove MediaWiki edit links & sections
    # (redundant with REMOVE_SELECTORS above; kept as a safety net)
    for el in content.select(".mw-editsection"):
        el.decompose()

    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            # FIX: the original compared against a set containing "(edit)"
            # twice (one copy presumably had a non-breaking space);
            # normalize both space kinds instead.
            parent_text = parent.get_text(strip=True)
            if parent_text.replace(" ", "").replace("\xa0", "") == "(edit)":
                parent.decompose()
                continue
        a.decompose()

    # Remove MediaWiki show/hide links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()

    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(
            x in t
            for x in [
                "show/hide",
                "click expand",
                "expand to read",
            ]
        ):
            el.extract()

    # Simplify image links (must run while class="image" still exists)
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)

    # Strip presentation attributes.
    # FIX: the original stripped "class" *before* the class-based selections
    # above (.mw-editsection, .mw-collapsible-toggle..., a.image), which made
    # those removals dead code; stripping now happens after them.
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Fix links (important for next steps)
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    for img in content.find_all("img"):
        src = img.get("src", "")
        src = src.replace("../", "")
        img["src"] = src

    remove_empty_bracket_groups(content)
    remove_split_empty_parentheses(content)
    remove_inline_empty_brackets(content)
    remove_orphan_headers(content)

    # Output cleaned HTML
    cleaned_html = content.prettify()
    output_path.write_text(cleaned_html, encoding="utf-8")


def process_all():
    """Clean every *.html file in SOURCE_DIR into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    files = list(SOURCE_DIR.glob("*.html"))
    total = len(files)
    print(f"{total} fichiers trouvés")

    for i, file in enumerate(files, start=1):
        output_file = OUTPUT_DIR / file.name
        clean_html_file(file, output_file)
        if i % 200 == 0 or i == total:
            print(f"{i}/{total} analysés ({i/total:.1%})")

    print("✅ Cleaning complete")


if __name__ == "__main__":
    process_all()