extract page content WIP
This commit is contained in:
parent
0aace3dfc8
commit
c9fb3513ee
3 changed files with 226 additions and 1 deletions
225
extract_content.py
Normal file
225
extract_content.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup, Comment
|
||||
|
||||
# ======================
# CONFIG
# ======================

# Input: directory of raw mirrored HTML pages (relative to the CWD).
SOURCE_DIR = Path("../output/pages")
# Output: directory for cleaned pages; created on demand by process_all().
OUTPUT_DIR = Path("../output/cleaned_pages")

# CSS selectors for MediaWiki navigation/chrome removed wholesale.
REMOVE_SELECTORS = [
    "#toc",
    ".toc",
    ".mw-editsection",
    ".reference",
    ".noprint",
    ".mw-jump",
    "#catlinks",
    ".printfooter",
    ".mw-category",
    ".mw-normal-catlinks",
    ".mw-category-generated",
    "#mw-subcategories",
    "#mw-pages"
]

# Tag names whose elements carry no page content and are dropped entirely.
REMOVE_TAGS = [
    "script",
    "style",
]

# Lowercase substrings; any heading matching one has its whole section removed.
SECTION_TITLES_TO_REMOVE = [
    "see also",
    "subcategories",
    "categories",
]

# Elements whose class list contains any of these substrings are removed
# ("mw-collapsible" gets special-cased handling in clean_html_file).
REMOVE_CLASSES_CONTAINS = [
    "navbox",
    "vertical-navbox",
    "mw-collapsible",
]

# Presentational attributes stripped from every remaining element.
STRIP_ATTRIBUTES = [
    "style",
    "class",
    "width",
    "height",
    "border",
]
|
||||
|
||||
# ======================
|
||||
# HELPERS
|
||||
# ======================
|
||||
|
||||
def safe_elements(elements):
    """Yield only the items that expose a truthy ``attrs`` mapping.

    Skips anything without attributes (e.g. bare text nodes, which lack
    ``attrs``), so callers can read ``el.attrs`` without guarding.
    The input is snapshotted up front because callers decompose elements
    while iterating.
    """
    snapshot = list(elements)
    for element in snapshot:
        if getattr(element, "attrs", None):
            yield element
|
||||
|
||||
def is_unwanted_section(title: str, unwanted=None) -> bool:
    """Return True if a section heading should be removed.

    Args:
        title: Section heading text, any case.
        unwanted: Optional iterable of lowercase substrings to match
            against; defaults to the module-level SECTION_TITLES_TO_REMOVE.
            (Generalized from the hard-coded constant; existing callers
            are unaffected.)

    Returns:
        True when any unwanted substring occurs in the lowercased title.
    """
    if unwanted is None:
        unwanted = SECTION_TITLES_TO_REMOVE
    lowered = title.lower()
    return any(t in lowered for t in unwanted)
|
||||
|
||||
def remove_section(header):
    """Delete *header* and every following sibling up to the next heading.

    Walks forward from the header, decomposing each sibling in turn, and
    stops once the next sibling is missing/empty or is another h1/h2/h3
    (that heading itself is kept).
    """
    node = header
    while True:
        # Grab the successor BEFORE decomposing, since decompose()
        # detaches the node from the tree.
        successor = node.find_next_sibling()
        node.decompose()
        # NOTE: `not successor` (rather than `is None`) also stops on an
        # empty (falsy) tag, matching the original behavior.
        if not successor or successor.name in ("h1", "h2", "h3"):
            break
        node = successor
|
||||
|
||||
def is_bad_collapsible(el):
    """Return True when *el* is a collapsible navbox.

    Both markers must be present in the element's class list.
    """
    classes = el.get("class", [])
    has_collapsible = "mw-collapsible" in classes
    has_navbox = "navbox" in classes  # or some other criterion
    return has_collapsible and has_navbox
|
||||
|
||||
|
||||
def is_in_rules_clarifications(el):
    """Return True if *el* sits under a "Rules Clarification" heading.

    Looks at the nearest preceding h1/h2/h3 in document order; elements
    with no preceding heading are considered outside any such section.
    """
    heading = el.find_previous(["h1", "h2", "h3"])
    if not heading:
        return False
    return "rules clarification" in heading.get_text(strip=True).lower()
|
||||
|
||||
# ======================
|
||||
# CORE FUNCTIONS
|
||||
# ======================
|
||||
|
||||
def clean_html_file(input_path: Path, output_path: Path):
    """Clean one mirrored MediaWiki page and write the result.

    Reads *input_path*, extracts the ``#mw-content-text`` container, strips
    navigation chrome, edit links, hidden and collapsible elements, and
    presentational attributes, rewrites relative links, and writes the
    prettified fragment to *output_path*.

    Pages without a ``#mw-content-text`` node are skipped with a warning.

    Fixes over the original:
      * the "(edit)" wrapper set contained "(edit)" twice; the second entry
        is now "[edit]", the form MediaWiki actually renders;
      * the hidden-element check now matches both "display:none" and
        "display: none" spellings;
      * attribute stripping (which removes ``class``) used to run BEFORE the
        class-based passes (.mw-editsection, .mw-collapsible-toggle…,
        a.image), making them dead code — it now runs last.
    """
    html = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    # Remove comments (HTTrack mirroring notes etc.)
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove non-content tags (scripts/styles)
    for tag in REMOVE_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    # Extract main content
    content = soup.select_one("#mw-content-text")
    if not content:
        print(f"[WARN] No content in {input_path.name}")
        return

    # Detect category pages and drop their auto-generated listings
    page_title = soup.title.string if soup.title else ""
    if page_title and "Category:" in page_title:
        for el in content.select(".mw-category-generated"):
            el.decompose()

    # Remove unwanted blocks
    for selector in REMOVE_SELECTORS:
        for el in content.select(selector):
            el.decompose()

    # Remove unwanted sections by title (See also, Categories, Subcategories)
    for header in content.find_all(["h1", "h2", "h3"]):
        heading_text = header.get_text(strip=True).lower()
        if is_unwanted_section(heading_text):
            remove_section(header)

    # Remove unwanted class-based elements; collapsible boxes survive only
    # inside "Rules Clarification" sections.
    for el in safe_elements(content.find_all(True)):
        classes = el.attrs.get("class", [])
        if "mw-collapsible" in classes:
            if not is_in_rules_clarifications(el):
                el.decompose()
            continue
        if any(rc in c for c in classes for rc in REMOVE_CLASSES_CONTAINS if rc != "mw-collapsible"):
            el.decompose()

    # Remove hidden elements. Normalizing whitespace catches both
    # "display:none" and "display: none" (the original matched only the
    # spaced form).
    for el in safe_elements(content.find_all(True)):
        style = el.get("style", "")
        if style and "display:none" in style.replace(" ", ""):
            el.decompose()

    # Remove MediaWiki edit links & sections
    for el in content.select(".mw-editsection"):
        el.decompose()
    for a in content.find_all("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True).lower()
        if "action=edit" not in href and text != "edit":
            continue
        parent = a.parent
        if parent:
            parent_text = parent.get_text(strip=True)
            # Drop the whole "(edit)" / "[edit]" wrapper when the link is
            # all it contains. (The original set listed "(edit)" twice.)
            if parent_text.replace(" ", "") in {"(edit)", "[edit]"}:
                parent.decompose()
                continue
        a.decompose()

    # Drop "(" / ")" text nodes orphaned by the edit-link removal
    for el in content.find_all(string=True):
        t = el.strip()
        if t in {"(", ")"}:
            if len(el.parent.get_text(strip=True)) <= 2:
                el.extract()

    # Remove MediaWiki show/hide toggle links
    for el in content.select(
        ".mw-collapsible-toggle, .mw-collapsible-text, .mw-collapsible-toggle-placeholder"
    ):
        el.decompose()
    for el in content.find_all(string=True):
        t = el.strip().lower()
        if any(x in t for x in [
            "show/hide",
            "click expand",
            "expand to read",
            "( )",
            "[ ]"
        ]):
            el.extract()

    # Fix links (important for next steps)
    for a in list(content.find_all("a")):
        href = a.attrs.get("href")
        if not href:
            continue
        href = href.replace("../", "")
        href = href.replace("index.php?title=", "")
        a["href"] = href

    # Simplify image links: replace <a class="image"><img/></a> with its <img>
    for a in list(content.find_all("a", class_="image")):
        img = a.find("img")
        if img:
            a.replace_with(img)
    for img in content.find_all("img"):
        src = img.get("src", "")
        img["src"] = src.replace("../", "")

    # Strip presentational attributes LAST, after every class-based pass
    # above has run (stripping removes "class", so doing this earlier made
    # those passes no-ops).
    for el in safe_elements(content.find_all(True)):
        for attr in STRIP_ATTRIBUTES:
            el.attrs.pop(attr, None)

    # Output cleaned HTML
    output_path.write_text(content.prettify(), encoding="utf-8")
|
||||
|
||||
|
||||
def process_all():
    """Clean every ``*.html`` page in SOURCE_DIR into OUTPUT_DIR.

    Ensures the output directory exists, then runs clean_html_file on each
    page, writing the cleaned copy under the same file name.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for source_file in SOURCE_DIR.glob("*.html"):
        clean_html_file(source_file, OUTPUT_DIR / source_file.name)

    print("✅ Cleaning complete")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: clean every mirrored page in SOURCE_DIR.
    process_all()
|
||||
Loading…
Add table
Add a link
Reference in a new issue