From 258886173efeb2434734875705b2ca0f123d1a82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?=
Date: Wed, 15 Apr 2026 08:39:40 +0200
Subject: [PATCH] remove orphan headers

Drop h1-h4 headers that introduce no content: nothing follows them in
their parent, or the next node is a header of the same or higher rank.
Bare text after a header counts as content, and headers are processed
in reverse document order so emptied sections are removed in one pass.
---
Review notes: context-line indentation below is reconstructed from a
whitespace-collapsed copy of this patch; commit and blob ids are those
of the original revision.

 extract_content.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/extract_content.py b/extract_content.py
index 58c18ff..93da286 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -185,6 +185,47 @@ def remove_split_empty_parentheses(tag):
             continue
         i += 1
 
+def remove_orphan_headers(tag):
+    """Remove h1-h4 headers that do not introduce any content.
+
+    Whitespace-only text, comments and br tags after a header are
+    ignored. The header is removed when nothing else follows it inside
+    its parent, or when the next node is a header of the same or a
+    higher rank (e.g. an h3 followed by an h2 or another h3). It
+    is kept when followed by text, a deeper header or any other element.
+
+    Args:
+        tag: BeautifulSoup tag whose descendants are cleaned in place.
+    """
+    from bs4 import Comment
+
+    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}
+    # Reverse document order: the siblings after a header are already
+    # cleaned when it is examined, so a section left with only orphan
+    # sub-headers is removed in the same pass.
+    for header in reversed(tag.find_all(list(levels))):
+        follower = None
+        # .next_siblings also yields text nodes (find_next_sibling()
+        # returns tags only), so bare text after a header is seen.
+        for node in header.next_siblings:
+            if isinstance(node, Comment):
+                continue
+            if isinstance(node, str):
+                if not node.strip():
+                    continue
+            elif node.name == "br":
+                continue
+            follower = node
+            break
+        if follower is None:
+            header.decompose()
+            continue
+        if isinstance(follower, str):
+            continue
+        rank = levels.get(follower.name)
+        if rank is not None and rank <= levels[header.name]:
+            header.decompose()
+
 # ======================
 # CORE FUNCTIONS
 # ======================
@@ -299,6 +340,7 @@ def clean_html_file(input_path: Path, output_path: Path):
     remove_empty_bracket_groups(content)
     remove_split_empty_parentheses(content)
     remove_inline_empty_brackets(content)
+    remove_orphan_headers(content)
 
     # Output cleaned HTML
     cleaned_html = content.prettify()