extract page content WIP

This commit is contained in:
Maxime Réaux 2026-04-14 15:15:34 +02:00
parent 0aace3dfc8
commit c9fb3513ee
3 changed files with 226 additions and 1 deletions

View file

@@ -9,7 +9,7 @@ from difflib import SequenceMatcher
from bs4 import BeautifulSoup
import unicodedata
-SOURCE_DIR = Path("../original_index")
+SOURCE_DIR = Path("../test")
OUTPUT_DIR = Path("../output")
PAGES_DIR = Path(OUTPUT_DIR / "pages")