From 258886173efeb2434734875705b2ca0f123d1a82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maxime=20R=C3=A9aux?= <maxime.reaux@advans-group.com>
Date: Wed, 15 Apr 2026 08:39:40 +0200
Subject: [PATCH] remove orphan headers

---
 extract_content.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/extract_content.py b/extract_content.py
index 58c18ff..93da286 100644
--- a/extract_content.py
+++ b/extract_content.py
@@ -185,6 +185,27 @@ def remove_split_empty_parentheses(tag):
                                 continue
             i += 1
 
+def remove_orphan_headers(tag):
+    """Remove h1-h4 headers under `tag` that introduce no content (in place).
+
+    Ignoring blank text and <br>, a header is an orphan when no sibling follows
+    it or the next one is a header of the same or a higher level.
+    """
+    levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4}
+    # Bottom-up, so removing an orphan can in turn orphan the header above it.
+    for header in reversed(tag.find_all(list(levels))):
+        # .next_sibling also yields text nodes (find_next_sibling() skips them),
+        # so bare text right after a header counts as content.
+        nxt = header.next_sibling
+        while nxt is not None and (
+            (isinstance(nxt, str) and not nxt.strip())
+            or getattr(nxt, "name", None) == "br"
+        ):
+            nxt = nxt.next_sibling
+        name = getattr(nxt, "name", None)
+        if nxt is None or (name in levels and levels[name] <= levels[header.name]):
+            header.decompose()
+
 # ======================
 # CORE FUNCTIONS
 # ======================
@@ -299,6 +320,7 @@ def clean_html_file(input_path: Path, output_path: Path):
     remove_empty_bracket_groups(content)
     remove_split_empty_parentheses(content)
     remove_inline_empty_brackets(content)
+    remove_orphan_headers(content)
 
     # Output cleaned HTML
     cleaned_html = content.prettify()