#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "playwright",
# ]
# ///
"""
Script to extract content from People Playground wiki pages.
Processes URLs listed in sidebar.txt / source_documentation.txt (or a file passed
via --input) and saves the extracted content to text files.
"""

import os
import re
import time
import argparse
from playwright.sync_api import sync_playwright


def get_category_from_url(url):
    """Extract category from wiki URL and return folder name."""
    match = re.search(r'https://wiki\.studiominus\.nl/([^/]+)/', url)
    if match:
        category = match.group(1)
        # Map wiki categories to folder names
        folder_map = {
            'internalReference': 'internalReference',
            'tutorials': 'tutorials',
            'snippets': 'snippets',
            'details': 'details'
        }
        return folder_map.get(category, '')
    return ''


def create_safe_filename(title, url):
    """Create a safe filename from title and URL."""
    # Remove the common prefix from the title
    clean_title = title.replace('People Playground Modding - ', '')

    # If the title is empty or the prefix wasn't present, fall back to the URL path
    if not clean_title or clean_title == title:
        path_match = re.search(r'/([^/]+)\.html$', url)
        if path_match:
            clean_title = path_match.group(1)
        else:
            clean_title = 'page'

    # Replace spaces and special characters with underscores
    safe_name = re.sub(r'[^a-z0-9]+', '_', clean_title.lower())
    safe_name = re.sub(r'^_+|_+$', '', safe_name)
    safe_name = re.sub(r'_+', '_', safe_name)

    return f"{safe_name}.txt"


def is_file_already_extracted(filename, output_dir, expected_url):
    """Check if a file has already been extracted and contains the expected URL."""
    filepath = os.path.join(output_dir, filename)
    if not os.path.exists(filepath):
        return False

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
            # Check if the first line contains the expected URL
            return first_line == f"URL: {expected_url}"
    except (IOError, UnicodeDecodeError):
        return False


def extract_content_from_page(page):
    """Extract main content from the current page."""
    return page.evaluate("""() => {
        const mainContent = document.querySelector('.page#markdown-result');
        if (mainContent) {
            // Clone the main content to avoid modifying the original
            const contentClone = mainContent.cloneNode(true);

            // Remove any script elements from the clone
            const scripts = contentClone.querySelectorAll('script');
            scripts.forEach(script => script.remove());

            // Remove any style elements from the clone
            const styles = contentClone.querySelectorAll('style');
            styles.forEach(style => style.remove());

            // Remove the obsolete message if it exists
            const obsoleteMsg = contentClone.querySelector('.obsolete-message');
            if (obsoleteMsg) {
                obsoleteMsg.remove();
            }

            return {
                title: document.title,
                content: contentClone.innerText.trim(),
                html: contentClone.innerHTML.trim()
            };
        } else {
            return {
                title: document.title,
                content: "Main content area not found",
                html: ""
            };
        }
    }""")


def process_wiki_urls(input_file=None, summary_name='summary.txt'):
    """Process all wiki URLs and extract content."""
    lines = []

    if input_file:
        # Use the specified input file
        input_path = input_file
        if os.path.exists(input_path):
            with open(input_path, 'r') as f:
                lines = f.readlines()
            print(f"Processing URLs from: {input_path}")
        else:
            print(f"Error: Specified input file '{input_path}' not found!")
            return
    else:
        # Auto-detect input files
        input_files = []

        # Check for sidebar.txt
        if os.path.exists('sidebar.txt'):
            input_files.append('sidebar.txt')

        # Check for source_documentation.txt
        if os.path.exists('source_documentation.txt'):
            input_files.append('source_documentation.txt')

        if not input_files:
print("No input files found! Looking for sidebar.txt or source_documentation.txt") print("Usage: python process_wiki_urls.py [--input ]") return # Read from all found input files for file_path in input_files: print(f"Processing URLs from: {file_path}") with open(file_path, 'r') as f: lines.extend(f.readlines()) if not lines: print("No URLs found in input files!") return # Parse URLs and titles urls_and_titles = [] for line in lines: line = line.strip() if line and ' - ' in line: url, title = line.split(' - ', 1) urls_and_titles.append((url.strip(), title.strip())) # Create base output directory in parent directory base_output_dir = '../extracted_wiki_content' os.makedirs(base_output_dir, exist_ok=True) # Summary file goes in the same directory as this script script_dir = os.path.dirname(os.path.abspath(__file__)) summary_output_dir = script_dir # Summary data summary = [] with sync_playwright() as p: browser = p.chromium.launch() page = browser.new_page() try: for i, (url, expected_title) in enumerate(urls_and_titles, 1): print(f"Processing {i}/{len(urls_and_titles)}: {url}") try: # Try to predict filename from expected title first temp_filename = create_safe_filename(expected_title, url) # Get category for the expected filename check temp_category = get_category_from_url(url) if temp_category: temp_output_dir = os.path.join(base_output_dir, temp_category) else: temp_output_dir = base_output_dir # Check if already extracted if is_file_already_extracted(temp_filename, temp_output_dir, url): category_path = f"{temp_category}/" if temp_category else "" print(f" ✓ Skipped (already extracted to {category_path}{temp_filename})") summary.append({ 'url': url, 'title': expected_title, 'expected_title': expected_title, 'filename': temp_filename, 'content_length': 0, 'skipped': True }) continue # Navigate to the page page.goto(url, wait_until='networkidle', timeout=30000) # Extract content extracted_data = extract_content_from_page(page) # Create filename from actual extracted title filename = create_safe_filename(extracted_data['title'], url) # Get category and create appropriate folder category = get_category_from_url(url) if category: output_dir = os.path.join(base_output_dir, category) os.makedirs(output_dir, exist_ok=True) else: output_dir = base_output_dir filepath = os.path.join(output_dir, filename) # Double-check with actual filename (in case title differs from expected) if filename != temp_filename and is_file_already_extracted(filename, output_dir, url): print(f" ✓ Skipped (already extracted to {category}/{filename})") summary.append({ 'url': url, 'title': extracted_data['title'], 'expected_title': expected_title, 'filename': filename, 'content_length': 0, 'skipped': True }) continue # Save content to file with open(filepath, 'w', encoding='utf-8') as f: f.write(f"URL: {url}\n") f.write(f"Title: {extracted_data['title']}\n") f.write("=" * 50 + "\n\n") f.write(extracted_data['content']) # Add to summary summary.append({ 'url': url, 'title': extracted_data['title'], 'expected_title': expected_title, 'filename': filename, 'content_length': len(extracted_data['content']) }) category_path = f"{category}/" if category else "" print(f" ✓ Saved to {category_path}{filename}") # Small delay to be respectful time.sleep(1) except Exception as e: print(f" ✗ Error processing {url}: {e}") summary.append({ 'url': url, 'title': 'ERROR', 'expected_title': expected_title, 'filename': 'ERROR', 'content_length': 0, 'error': str(e) }) finally: browser.close() # Create summary file summary_path = 
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("People Playground Wiki Content Extraction Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total URLs processed: {len(summary)}\n")
        f.write(f"Skipped (already extracted): {len([s for s in summary if s.get('skipped', False)])}\n")
        f.write(f"Successful extractions: {len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])}\n")
        f.write(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}\n\n")
        f.write("Extracted Pages:\n")
        f.write("-" * 30 + "\n")

        for item in summary:
            if item.get('skipped', False):
                f.write(f"• SKIPPED: {item['expected_title']} ({item['filename']})\n")
            elif item['title'] != 'ERROR':
                f.write(f"• {item['expected_title']} ({item['filename']})\n")
            else:
                f.write(f"• ERROR: {item['expected_title']} - {item['error']}\n")

    skipped_count = len([s for s in summary if s.get('skipped', False)])
    extracted_count = len([s for s in summary if s['title'] != 'ERROR' and not s.get('skipped', False)])

    print(f"\nExtraction complete! Summary saved to {summary_path}")
    print(f"Total pages extracted: {extracted_count}")
    if skipped_count > 0:
        print(f"Pages skipped (already extracted): {skipped_count}")
    print(f"Failed extractions: {len([s for s in summary if s['title'] == 'ERROR'])}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract content from People Playground wiki pages')
    parser.add_argument('--input', '-i',
                        help='Input file containing URLs (default: auto-detect sidebar.txt and source_documentation.txt)')
    parser.add_argument('--summary-name', '-n', default='summary.txt',
                        help='Name for summary file (default: summary.txt)')

    args = parser.parse_args()
    process_wiki_urls(args.input, args.summary_name)
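
# Usage notes (illustrative sketch only; the invocations below assume the input
# files sit next to this script and that uv is installed, neither of which the
# script itself checks):
#
#   ./process_wiki_urls.py                       # auto-detects sidebar.txt / source_documentation.txt
#   uv run process_wiki_urls.py --input sidebar.txt --summary-name sidebar_summary.txt
#
# Playwright's Chromium browser must be installed once beforehand (e.g. with
# `playwright install chromium` in an environment that has the playwright
# package); otherwise chromium.launch() will fail.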