extract page content WIP
This commit is contained in:
parent
0aace3dfc8
commit
c9fb3513ee
3 changed files with 226 additions and 1 deletions
|
|
@@ -9,7 +9,7 @@ from difflib import SequenceMatcher
|
|||
from bs4 import BeautifulSoup
|
||||
import unicodedata
|
||||
|
||||
SOURCE_DIR = Path("../original_index")
|
||||
SOURCE_DIR = Path("../test")
|
||||
OUTPUT_DIR = Path("../output")
|
||||
|
||||
PAGES_DIR = Path(OUTPUT_DIR / "pages")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue