remove orphan headers
This commit is contained in:
parent
6eeabd7c9d
commit
258886173e
1 changed files with 22 additions and 0 deletions
|
|
@ -185,6 +185,27 @@ def remove_split_empty_parentheses(tag):
|
||||||
continue
|
continue
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
|
def _next_content_node(header):
    """Return the first node after *header* that is not blank text or <br>.

    Walks ``header.next_siblings`` so that bare text nodes are seen as well
    as tags. Returns ``None`` when only whitespace-only strings and ``<br>``
    tags (or nothing at all) follow the header.
    """
    for node in header.next_siblings:
        if isinstance(node, str):
            # Whitespace-only text between tags is layout, not content.
            if node.strip():
                return node
            continue
        if getattr(node, "name", None) == "br":
            continue
        return node
    return None


def remove_orphan_headers(tag):
    """Remove h1-h4 headers that do not introduce any content, in place.

    A header is removed when, ignoring whitespace-only text and ``<br>``
    tags, it is either the last node among its siblings or is immediately
    followed by another header of the same or a higher rank (a numerically
    equal or smaller level). A header followed by a deeper header, by
    non-blank text, or by any other element is kept.

    Args:
        tag: Parsed element whose descendants are searched with ``find_all``;
            matching headers are removed with ``decompose()``.

    Returns:
        None. The tree under *tag* is modified in place.
    """
    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}

    # Process headers bottom-up: removing an orphan sub-header can leave its
    # parent header without content, and the parent must then be re-judged
    # against the already-cleaned siblings that follow it.
    for header in reversed(tag.find_all(list(levels))):
        follower = _next_content_node(header)

        if follower is None:
            # Nothing but blanks after the header: it introduces no content.
            header.decompose()
            continue

        # Text nodes and non-header elements have no entry in ``levels``;
        # they count as content, so the header stays.
        follower_level = levels.get(getattr(follower, "name", None))
        if follower_level is None:
            continue

        # A following header of the same or higher rank starts a new
        # section, which means this header's own section is empty.
        if follower_level <= levels[header.name]:
            header.decompose()
|
||||||
|
|
||||||
# ======================
|
# ======================
|
||||||
# CORE FUNCTIONS
|
# CORE FUNCTIONS
|
||||||
# ======================
|
# ======================
|
||||||
|
|
@ -299,6 +320,7 @@ def clean_html_file(input_path: Path, output_path: Path):
|
||||||
remove_empty_bracket_groups(content)
|
remove_empty_bracket_groups(content)
|
||||||
remove_split_empty_parentheses(content)
|
remove_split_empty_parentheses(content)
|
||||||
remove_inline_empty_brackets(content)
|
remove_inline_empty_brackets(content)
|
||||||
|
remove_orphan_headers(content)
|
||||||
|
|
||||||
# Output cleaned HTML
|
# Output cleaned HTML
|
||||||
cleaned_html = content.prettify()
|
cleaned_html = content.prettify()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue