tweak paths and find links

This commit is contained in:
maximator 2026-04-03 15:50:40 +02:00
parent 36c8bb2354
commit e4aaa33137
7 changed files with 309 additions and 10 deletions

View file

@@ -4,8 +4,8 @@ import re
import json
# Input/output directory paths; relative, so they resolve against the current
# working directory at run time.
# NOTE(review): INPUT_DIR and OUTPUT_DIR are each assigned twice in a row, and
# the second pair overrides the first. This looks like the removed and added
# lines of a diff hunk whose +/- markers were lost -- confirm which pair is
# intended before relying on either.
INPUT_DIR = Path(".")
OUTPUT_DIR = Path("unique_pages")
INPUT_DIR = Path("../original_index")  # effective value when read top to bottom
OUTPUT_DIR = Path("../unique_pages")  # effective value when read top to bottom
# Assuming Path is pathlib.Path (its import is not visible here): creates the
# output directory at import time; exist_ok=True tolerates an existing
# directory. parents is not passed, so the parent directory must already exist.
OUTPUT_DIR.mkdir(exist_ok=True)
# Matches a "wgArticleId": key followed by optional whitespace and captures
# the run of digits after it as group 1.
ARTICLE_ID_RE = re.compile(r'"wgArticleId":\s*(\d+)')