remove orphan headers

This commit is contained in:
Maxime Réaux 2026-04-15 08:39:40 +02:00
parent 6eeabd7c9d
commit 258886173e

View file

@@ -185,6 +185,27 @@ def remove_split_empty_parentheses(tag):
continue
i += 1
def remove_orphan_headers(tag) -> None:
    """Remove h1-h4 headers that do not introduce any content.

    A header is treated as an orphan when, after skipping whitespace-only
    text nodes and ``<br>`` elements, its next sibling is either missing or
    is another header of the same or a higher rank (for example an ``h3``
    directly followed by an ``h3`` or an ``h2``). A header followed by a
    deeper header, by non-blank text, or by any other element is kept.

    Args:
        tag: Parsed HTML tree (presumably a BeautifulSoup ``Tag``; it must
            provide ``find_all`` and its nodes ``next_sibling`` and
            ``decompose``). Orphan headers are removed from it in place.
    """
    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}

    # Walk the headers bottom-up. A header's verdict depends only on the
    # siblings that follow it, so deciding the later ones first lets a
    # removal cascade: once a trailing sub-header is dropped, the header
    # above it is re-evaluated against what is really left, and a second
    # run of this function finds nothing more to remove.
    for header in reversed(tag.find_all(list(levels))):
        # Use ``next_sibling`` rather than ``find_next_sibling()``: the
        # latter only returns elements, so loose text placed directly after
        # a header would be ignored and the header wrongly removed.
        nxt = header.next_sibling
        while nxt is not None and (
            (isinstance(nxt, str) and not nxt.strip())
            or getattr(nxt, "name", None) == "br"
        ):
            nxt = nxt.next_sibling

        # Nothing meaningful follows the header inside its parent.
        if nxt is None:
            header.decompose()
            continue

        # Text nodes have no usable tag name, so they never match ``levels``
        # and therefore count as content that keeps the header.
        next_level = levels.get(getattr(nxt, "name", None))
        if next_level is not None and next_level <= levels[header.name]:
            # Immediately followed by a header of the same or a higher rank:
            # this header has no section body of its own.
            header.decompose()
        # NOTE(review): a header followed by an element with no text (e.g. an
        # empty <p>) is kept, as before. Comment nodes are string nodes too
        # and currently count as content -- confirm both are intended.
# ======================
# CORE FUNCTIONS
# ======================
@@ -299,6 +320,7 @@ def clean_html_file(input_path: Path, output_path: Path):
remove_empty_bracket_groups(content)
remove_split_empty_parentheses(content)
remove_inline_empty_brackets(content)
remove_orphan_headers(content)
# Output cleaned HTML
cleaned_html = content.prettify()