remove orphan headers
This commit is contained in:
parent
6eeabd7c9d
commit
258886173e
1 changed file with 22 additions and 0 deletions
|
|
@ -185,6 +185,27 @@ def remove_split_empty_parentheses(tag):
|
|||
continue
|
||||
i += 1
|
||||
|
||||
def _first_content_sibling(node):
    """Return the first following sibling that is not blank text or a ``<br>``.

    Walks ``node.next_siblings`` so that text nodes are seen as well as
    elements. Returns ``None`` when only whitespace and line breaks follow.
    """
    for sibling in node.next_siblings:
        if isinstance(sibling, str):
            # Whitespace-only text is layout noise; any other text is content.
            if sibling.strip():
                return sibling
            continue
        if getattr(sibling, "name", None) == "br":
            continue
        return sibling
    return None


def remove_orphan_headers(tag):
    """Remove ``h1``-``h4`` headers that introduce no content, in place.

    A header is considered orphaned, and is decomposed, when either:

    * nothing but whitespace and ``<br>`` elements follows it among its
      siblings, or
    * the first meaningful sibling after it is another header of the same
      or a higher rank (e.g. an ``h3`` directly followed by an ``h2``).

    A header is kept when it is followed by a deeper header (``h2`` then
    ``h3``), by any non-header element, or by non-blank text.

    Args:
        tag: Parsed tree to clean. It must provide ``find_all`` and its
            nodes must provide ``next_siblings``, ``name`` and
            ``decompose`` (the Beautiful Soup element API).

    Returns:
        None. The tree is modified in place.

    NOTE(review): every string-like sibling with non-blank text counts as
    content, which includes comment nodes — confirm that is acceptable.
    """
    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}

    # Snapshot the matches: headers are removed from the tree while looping.
    for header in list(tag.find_all(list(levels))):
        follower = _first_content_sibling(header)
        if follower is None:
            header.decompose()
            continue

        # Text nodes and non-header elements have no rank and keep the header.
        follower_level = levels.get(getattr(follower, "name", None))
        if follower_level is not None and follower_level <= levels[header.name]:
            header.decompose()
|
||||
|
||||
# ======================
|
||||
# CORE FUNCTIONS
|
||||
# ======================
|
||||
|
|
@ -299,6 +320,7 @@ def clean_html_file(input_path: Path, output_path: Path):
|
|||
remove_empty_bracket_groups(content)
|
||||
remove_split_empty_parentheses(content)
|
||||
remove_inline_empty_brackets(content)
|
||||
remove_orphan_headers(content)
|
||||
|
||||
# Output cleaned HTML
|
||||
cleaned_html = content.prettify()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue