feat: add initial People Playground mod development kit

This commit is contained in:
2026-01-06 06:35:51 +03:00
parent b89c805060
commit 10dbfd434c
1095 changed files with 40267 additions and 1 deletion

parsing_docs/process_wiki_urls.py Executable file

@@ -0,0 +1,303 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "playwright",
# ]
# ///
"""
Script to extract content from People Playground wiki pages.
Processes URLs from wiki_links.txt and saves content to text files.
"""
import os
import re
import time
import argparse
from playwright.sync_api import sync_playwright


def get_category_from_url(url):
    """Extract category from wiki URL and return folder name."""
    match = re.search(r'https://wiki\.studiominus\.nl/([^/]+)/', url)
    if match:
        category = match.group(1)
        # Map wiki categories to folder names
        folder_map = {
            'internalReference': 'internalReference',
            'tutorials': 'tutorials',
            'snippets': 'snippets',
            'details': 'details'
        }
        return folder_map.get(category, '')
    return ''
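# For example (illustrative URL), 'https://wiki.studiominus.nl/tutorials/some-page.html'
# maps to the 'tutorials' folder; unrecognised categories fall back to ''.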


def create_safe_filename(title, url):
    """Create a safe filename from title and URL."""
    # Remove the common prefix from the title
    clean_title = title.replace('People Playground Modding - ', '')
    # If the title is empty, was only the prefix, or did not contain it, fall back to the URL path
    if not clean_title or clean_title == title:
        path_match = re.search(r'/([^/]+)\.html$', url)
        if path_match:
            clean_title = path_match.group(1)
        else:
            clean_title = 'page'
    # Replace spaces and special characters with underscores
    safe_name = re.sub(r'[^a-z0-9]+', '_', clean_title.lower())
    safe_name = re.sub(r'^_+|_+$', '', safe_name)
    safe_name = re.sub(r'_+', '_', safe_name)
    return f"{safe_name}.txt"


def is_file_already_extracted(filename, output_dir, expected_url):
    """Check if a file has already been extracted and contains the expected URL."""
    filepath = os.path.join(output_dir, filename)
    if not os.path.exists(filepath):
        return False
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            # Check if the first line contains the expected URL
            return first_line == f"URL: {expected_url}"
    except (IOError, UnicodeDecodeError):
        return False


def extract_content_from_page(page):
    """Extract main content from the current page."""
    return page.evaluate("""() => {
        const mainContent = document.querySelector('.page#markdown-result');
        if (mainContent) {
            // Clone the main content to avoid modifying the original
            const contentClone = mainContent.cloneNode(true);
            // Remove any script elements from the clone
            const scripts = contentClone.querySelectorAll('script');
            scripts.forEach(script => script.remove());
            // Remove any style elements from the clone
            const styles = contentClone.querySelectorAll('style');
            styles.forEach(style => style.remove());
            // Remove the obsolete message if it exists
            const obsoleteMsg = contentClone.querySelector('.obsolete-message');
            if (obsoleteMsg) {
                obsoleteMsg.remove();
            }
            return {
                title: document.title,
                content: contentClone.innerText.trim(),
                html: contentClone.innerHTML.trim()
            };
        } else {
            return {
                title: document.title,
                content: "Main content area not found",
                html: ""
            };
        }
    }""")


def process_wiki_urls(input_file=None, summary_name='summary.txt'):
    """Process all wiki URLs and extract content."""
    lines = []
    if input_file:
        # Use specified input file
        input_path = input_file
        if os.path.exists(input_path):
            with open(input_path, 'r') as f:
                lines = f.readlines()
            print(f"Processing URLs from: {input_path}")
        else:
            print(f"Error: Specified input file '{input_path}' not found!")
            return
    else:
        # Auto-detect input files
        input_files = []
        # Check for sidebar.txt
        if os.path.exists('sidebar.txt'):
            input_files.append('sidebar.txt')
        # Check for source_documentation.txt
        if os.path.exists('source_documentation.txt'):
            input_files.append('source_documentation.txt')
        if not input_files:
            print("No input files found! Looking for sidebar.txt or source_documentation.txt")
            print("Usage: python process_wiki_urls.py [--input <filename>]")
            return
        # Read from all found input files
        for file_path in input_files:
            print(f"Processing URLs from: {file_path}")
            with open(file_path, 'r') as f:
                lines.extend(f.readlines())
    if not lines:
        print("No URLs found in input files!")
        return
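
    # Expected input line format (one entry per line; the URL below is illustrative):
    #   https://wiki.studiominus.nl/tutorials/some-page.html - People Playground Modding - Some Page
    # Splitting on the first ' - ' keeps hyphenated titles intact.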
    # Parse URLs and titles
    urls_and_titles = []
    for line in lines:
        line = line.strip()
        if line and ' - ' in line:
            url, title = line.split(' - ', 1)
            urls_and_titles.append((url.strip(), title.strip()))
    # Create base output directory in parent directory
    base_output_dir = '../extracted_wiki_content'
    os.makedirs(base_output_dir, exist_ok=True)
    # Summary file goes in the same directory as this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    summary_output_dir = script_dir
    # Summary data
    summary = []

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        try:
            for i, (url, expected_title) in enumerate(urls_and_titles, 1):
                print(f"Processing {i}/{len(urls_and_titles)}: {url}")
                try:
                    # Try to predict filename from expected title first
                    temp_filename = create_safe_filename(expected_title, url)
                    # Get category for the expected filename check
                    temp_category = get_category_from_url(url)
                    if temp_category:
                        temp_output_dir = os.path.join(base_output_dir, temp_category)
                    else:
                        temp_output_dir = base_output_dir
                    # Check if already extracted
                    if is_file_already_extracted(temp_filename, temp_output_dir, url):
                        category_path = f"{temp_category}/" if temp_category else ""
                        print(f" ✓ Skipped (already extracted to {category_path}{temp_filename})")
                        summary.append({
                            'url': url,
                            'title': expected_title,
                            'expected_title': expected_title,
                            'filename': temp_filename,
                            'content_length': 0,
                            'skipped': True
                        })
                        continue
                    # Navigate to the page
                    page.goto(url, wait_until='networkidle', timeout=30000)
                    # Extract content
                    extracted_data = extract_content_from_page(page)
                    # Create filename from actual extracted title
                    filename = create_safe_filename(extracted_data['title'], url)
                    # Get category and create appropriate folder
                    category = get_category_from_url(url)
                    if category:
                        output_dir = os.path.join(base_output_dir, category)
                        os.makedirs(output_dir, exist_ok=True)
                    else:
                        output_dir = base_output_dir
                    filepath = os.path.join(output_dir, filename)
                    # Double-check with actual filename (in case title differs from expected)
                    if filename != temp_filename and is_file_already_extracted(filename, output_dir, url):
                        category_path = f"{category}/" if category else ""
                        print(f" ✓ Skipped (already extracted to {category_path}{filename})")
                        summary.append({
                            'url': url,
                            'title': extracted_data['title'],
                            'expected_title': expected_title,
                            'filename': filename,
                            'content_length': 0,
                            'skipped': True
                        })
                        continue
                    # Save content to file
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(f"URL: {url}\n")
                        f.write(f"Title: {extracted_data['title']}\n")
                        f.write("=" * 50 + "\n\n")
                        f.write(extracted_data['content'])
                    # Add to summary
                    summary.append({
                        'url': url,
                        'title': extracted_data['title'],
                        'expected_title': expected_title,
                        'filename': filename,
                        'content_length': len(extracted_data['content'])
                    })
                    category_path = f"{category}/" if category else ""
                    print(f" ✓ Saved to {category_path}{filename}")
                    # Small delay to be respectful
                    time.sleep(1)
                except Exception as e:
                    print(f" ✗ Error processing {url}: {e}")
                    summary.append({
                        'url': url,
                        'title': 'ERROR',
                        'expected_title': expected_title,
                        'filename': 'ERROR',
                        'content_length': 0,
                        'error': str(e)
                    })
        finally:
            browser.close()

    # Create summary file
    summary_path = os.path.join(summary_output_dir, summary_name)
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("People Playground Wiki Content Extraction Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total URLs processed: {len(summary)}\n")
        f.write(f"Skipped (already extracted): {len([s for s in summary if s.get('skipped', False)])}\n")
        f.write(f"Successful extractions: {len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])}\n")
        f.write(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}\n\n")
        f.write("Extracted Pages:\n")
        f.write("-" * 30 + "\n")
        for item in summary:
            if item.get('skipped', False):
                f.write(f"• SKIPPED: {item['expected_title']} ({item['filename']})\n")
            elif item['title'] != 'ERROR':
                f.write(f"• {item['expected_title']} ({item['filename']})\n")
            else:
                f.write(f"• ERROR: {item['expected_title']} - {item['error']}\n")
    skipped_count = len([s for s in summary if s.get('skipped', False)])
    extracted_count = len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])
    print(f"\nExtraction complete! Summary saved to {summary_path}")
    print(f"Total pages extracted: {extracted_count}")
    if skipped_count > 0:
        print(f"Pages skipped (already extracted): {skipped_count}")
    print(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract content from People Playground wiki pages')
parser.add_argument('--input', '-i', help='Input file containing URLs (default: auto-detect sidebar.txt and source_documentation.txt)')
parser.add_argument('--summary-name', '-n', default='summary.txt', help='Name for summary file (default: summary.txt)')
args = parser.parse_args()
process_wiki_urls(args.input, args.summary_name)
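
# Example invocations (run from this script's directory; filenames are illustrative):
#   ./process_wiki_urls.py
#   ./process_wiki_urls.py --input wiki_links.txt --summary-name wiki_summary.txt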