feat: add initial People Playground mod development kit

This commit is contained in:
2026-01-06 06:35:51 +03:00
parent b89c805060
commit 10dbfd434c
1095 changed files with 40267 additions and 1 deletion

parsing_docs/process_wiki_urls.py Executable file

@@ -0,0 +1,303 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "playwright",
# ]
# ///
"""
Script to extract content from People Playground wiki pages.
Processes URLs from wiki_links.txt and saves content to text files.
"""
import os
import re
import time
import argparse
from playwright.sync_api import sync_playwright


def get_category_from_url(url):
    """Extract category from wiki URL and return folder name."""
    match = re.search(r'https://wiki\.studiominus\.nl/([^/]+)/', url)
    if match:
        category = match.group(1)
        # Map wiki categories to folder names
        folder_map = {
            'internalReference': 'internalReference',
            'tutorials': 'tutorials',
            'snippets': 'snippets',
            'details': 'details'
        }
        return folder_map.get(category, '')
    return ''
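# For example (illustrative URL), 'https://wiki.studiominus.nl/tutorials/some-page.html'
# maps to the 'tutorials' folder; unrecognised categories fall back to ''.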


def create_safe_filename(title, url):
    """Create a safe filename from title and URL."""
    # Remove the common prefix from the title
    clean_title = title.replace('People Playground Modding - ', '')
    # If the title is empty, was only the prefix, or did not contain it, fall back to the URL path
    if not clean_title or clean_title == title:
        path_match = re.search(r'/([^/]+)\.html$', url)
        if path_match:
            clean_title = path_match.group(1)
        else:
            clean_title = 'page'
    # Replace spaces and special characters with underscores
    safe_name = re.sub(r'[^a-z0-9]+', '_', clean_title.lower())
    safe_name = re.sub(r'^_+|_+$', '', safe_name)
    safe_name = re.sub(r'_+', '_', safe_name)
    return f"{safe_name}.txt"


def is_file_already_extracted(filename, output_dir, expected_url):
    """Check if a file has already been extracted and contains the expected URL."""
    filepath = os.path.join(output_dir, filename)
    if not os.path.exists(filepath):
        return False
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            # Check if the first line contains the expected URL
            return first_line == f"URL: {expected_url}"
    except (IOError, UnicodeDecodeError):
        return False


def extract_content_from_page(page):
    """Extract main content from the current page."""
    return page.evaluate("""() => {
        const mainContent = document.querySelector('.page#markdown-result');
        if (mainContent) {
            // Clone the main content to avoid modifying the original
            const contentClone = mainContent.cloneNode(true);
            // Remove any script elements from the clone
            const scripts = contentClone.querySelectorAll('script');
            scripts.forEach(script => script.remove());
            // Remove any style elements from the clone
            const styles = contentClone.querySelectorAll('style');
            styles.forEach(style => style.remove());
            // Remove the obsolete message if it exists
            const obsoleteMsg = contentClone.querySelector('.obsolete-message');
            if (obsoleteMsg) {
                obsoleteMsg.remove();
            }
            return {
                title: document.title,
                content: contentClone.innerText.trim(),
                html: contentClone.innerHTML.trim()
            };
        } else {
            return {
                title: document.title,
                content: "Main content area not found",
                html: ""
            };
        }
    }""")


def process_wiki_urls(input_file=None, summary_name='summary.txt'):
    """Process all wiki URLs and extract content."""
    lines = []
    if input_file:
        # Use specified input file
        input_path = input_file
        if os.path.exists(input_path):
            with open(input_path, 'r') as f:
                lines = f.readlines()
            print(f"Processing URLs from: {input_path}")
        else:
            print(f"Error: Specified input file '{input_path}' not found!")
            return
    else:
        # Auto-detect input files
        input_files = []
        # Check for sidebar.txt
        if os.path.exists('sidebar.txt'):
            input_files.append('sidebar.txt')
        # Check for source_documentation.txt
        if os.path.exists('source_documentation.txt'):
            input_files.append('source_documentation.txt')
        if not input_files:
            print("No input files found! Looking for sidebar.txt or source_documentation.txt")
            print("Usage: python process_wiki_urls.py [--input <filename>]")
            return
        # Read from all found input files
        for file_path in input_files:
            print(f"Processing URLs from: {file_path}")
            with open(file_path, 'r') as f:
                lines.extend(f.readlines())
    if not lines:
        print("No URLs found in input files!")
        return
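
    # Expected input line format (one entry per line; the URL below is illustrative):
    #   https://wiki.studiominus.nl/tutorials/some-page.html - People Playground Modding - Some Page
    # Splitting on the first ' - ' keeps hyphenated titles intact.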
    # Parse URLs and titles
    urls_and_titles = []
    for line in lines:
        line = line.strip()
        if line and ' - ' in line:
            url, title = line.split(' - ', 1)
            urls_and_titles.append((url.strip(), title.strip()))
    # Create base output directory in parent directory
    base_output_dir = '../extracted_wiki_content'
    os.makedirs(base_output_dir, exist_ok=True)
    # Summary file goes in the same directory as this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    summary_output_dir = script_dir
    # Summary data
    summary = []

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        try:
            for i, (url, expected_title) in enumerate(urls_and_titles, 1):
                print(f"Processing {i}/{len(urls_and_titles)}: {url}")
                try:
                    # Try to predict filename from expected title first
                    temp_filename = create_safe_filename(expected_title, url)
                    # Get category for the expected filename check
                    temp_category = get_category_from_url(url)
                    if temp_category:
                        temp_output_dir = os.path.join(base_output_dir, temp_category)
                    else:
                        temp_output_dir = base_output_dir
                    # Check if already extracted
                    if is_file_already_extracted(temp_filename, temp_output_dir, url):
                        category_path = f"{temp_category}/" if temp_category else ""
                        print(f" ✓ Skipped (already extracted to {category_path}{temp_filename})")
                        summary.append({
                            'url': url,
                            'title': expected_title,
                            'expected_title': expected_title,
                            'filename': temp_filename,
                            'content_length': 0,
                            'skipped': True
                        })
                        continue
                    # Navigate to the page
                    page.goto(url, wait_until='networkidle', timeout=30000)
                    # Extract content
                    extracted_data = extract_content_from_page(page)
                    # Create filename from actual extracted title
                    filename = create_safe_filename(extracted_data['title'], url)
                    # Get category and create appropriate folder
                    category = get_category_from_url(url)
                    if category:
                        output_dir = os.path.join(base_output_dir, category)
                        os.makedirs(output_dir, exist_ok=True)
                    else:
                        output_dir = base_output_dir
                    filepath = os.path.join(output_dir, filename)
                    # Double-check with actual filename (in case title differs from expected)
                    if filename != temp_filename and is_file_already_extracted(filename, output_dir, url):
                        category_path = f"{category}/" if category else ""
                        print(f" ✓ Skipped (already extracted to {category_path}{filename})")
                        summary.append({
                            'url': url,
                            'title': extracted_data['title'],
                            'expected_title': expected_title,
                            'filename': filename,
                            'content_length': 0,
                            'skipped': True
                        })
                        continue
                    # Save content to file
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(f"URL: {url}\n")
                        f.write(f"Title: {extracted_data['title']}\n")
                        f.write("=" * 50 + "\n\n")
                        f.write(extracted_data['content'])
                    # Add to summary
                    summary.append({
                        'url': url,
                        'title': extracted_data['title'],
                        'expected_title': expected_title,
                        'filename': filename,
                        'content_length': len(extracted_data['content'])
                    })
                    category_path = f"{category}/" if category else ""
                    print(f" ✓ Saved to {category_path}{filename}")
                    # Small delay to be respectful
                    time.sleep(1)
                except Exception as e:
                    print(f" ✗ Error processing {url}: {e}")
                    summary.append({
                        'url': url,
                        'title': 'ERROR',
                        'expected_title': expected_title,
                        'filename': 'ERROR',
                        'content_length': 0,
                        'error': str(e)
                    })
        finally:
            browser.close()

    # Create summary file
    summary_path = os.path.join(summary_output_dir, summary_name)
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("People Playground Wiki Content Extraction Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total URLs processed: {len(summary)}\n")
        f.write(f"Skipped (already extracted): {len([s for s in summary if s.get('skipped', False)])}\n")
        f.write(f"Successful extractions: {len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])}\n")
        f.write(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}\n\n")
        f.write("Extracted Pages:\n")
        f.write("-" * 30 + "\n")
        for item in summary:
            if item.get('skipped', False):
                f.write(f"• SKIPPED: {item['expected_title']} ({item['filename']})\n")
            elif item['title'] != 'ERROR':
                f.write(f"• {item['expected_title']} ({item['filename']})\n")
            else:
                f.write(f"• ERROR: {item['expected_title']} - {item['error']}\n")
    skipped_count = len([s for s in summary if s.get('skipped', False)])
    extracted_count = len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])
    print(f"\nExtraction complete! Summary saved to {summary_path}")
    print(f"Total pages extracted: {extracted_count}")
    if skipped_count > 0:
        print(f"Pages skipped (already extracted): {skipped_count}")
    print(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract content from People Playground wiki pages')
parser.add_argument('--input', '-i', help='Input file containing URLs (default: auto-detect sidebar.txt and source_documentation.txt)')
parser.add_argument('--summary-name', '-n', default='summary.txt', help='Name for summary file (default: summary.txt)')
args = parser.parse_args()
process_wiki_urls(args.input, args.summary_name)
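
# Example invocations (run from this script's directory; filenames are illustrative):
#   ./process_wiki_urls.py
#   ./process_wiki_urls.py --input wiki_links.txt --summary-name wiki_summary.txt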