ppg-modkit/parsing_docs/extract_wiki_links.py

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
#     "requests",
# ]
# ///
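# Usage note: the shebang hands this file to `uv run --script`, which reads
# the PEP 723 metadata block above and runs the script in an ephemeral
# environment with bs4 and requests installed. So either of these works:
#
#   ./extract_wiki_links.py
#   uv run extract_wiki_links.py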
import os
import sys

import requests
from bs4 import BeautifulSoup


def download_page(url, filename):
    """Return the page HTML, downloading it and caching it in `filename`."""
    if os.path.exists(filename):
        print(f"Using cached {filename}")
        # Read the cache with the same encoding it was written in
        with open(filename, encoding='utf-8') as f:
            return f.read()
    print(f"Downloading {url}...")
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)
        return response.text
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        sys.exit(1)

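# Note on the caching above: the cache never expires, so delete
# internalReference.html to force a fresh download.
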
# Download the internal reference page
ref_url = 'https://wiki.studiominus.nl/internalReference.html'
html = download_page(ref_url, 'internalReference.html')
soup = BeautifulSoup(html, 'html.parser')
# Each index entry is an <li><p><a href="/internalReference/..."> link
with open('source_documentation.txt', 'w', encoding='utf-8') as f:
    for link in soup.select('li p a[href^="/internalReference/"]'):
        url = f"https://wiki.studiominus.nl{link['href']}"
        title = link.get_text().strip()
        f.write(f'{url} - {title}\n')
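
# Expected result: source_documentation.txt holds one "<url> - <title>" line
# per matched link, along the lines of (hypothetical entry):
#   https://wiki.studiominus.nl/internalReference/example.html - Example Page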