Test new script
main.py
@@ -1,21 +1,33 @@
 #!/usr/bin/env python3
+"""
+RAG Learning System
+A dual-mode RAG system designed for progressive learning with AI guidance.
+Tracks your knowledge, suggests new topics, and helps identify learning gaps.
+"""
+
 import os
 import sys
 import json
 import hashlib
 import asyncio
 import re
+import yaml
 from pathlib import Path
-from collections import deque
-from typing import List, Dict
+from collections import deque, defaultdict
+from typing import List, Dict, Set
+from datetime import datetime, timedelta
 
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
+from rich.table import Table
+from rich.prompt import Prompt, Confirm
+from rich.progress import Progress, SpinnerColumn, TextColumn
 from prompt_toolkit import PromptSession
 from prompt_toolkit.styles import Style
 
 from langchain_community.document_loaders import UnstructuredMarkdownLoader
+from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_ollama import OllamaEmbeddings, ChatOllama
 from langchain_chroma import Chroma
@@ -27,7 +39,7 @@ from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 
 # =========================
-# CONFIG
+# CONFIGURATION
 # =========================
 console = Console(color_system="standard", force_terminal=True)
 session = PromptSession()
@@ -35,28 +47,61 @@ load_dotenv()
 
 style = Style.from_dict({"prompt": "bold #6a0dad"})
 
+# Core Configuration
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
 ANSWER_COLOR = os.getenv("ANSWER_COLOR", "blue")
 
-SYSTEM_PROMPT_SEARCH = os.getenv("SYSTEM_PROMPT", "You are a precise technical assistant. Cite sources using [filename]. Be concise.")
+# Enhanced System Prompts
+SYSTEM_PROMPT_SEARCH = os.getenv("SYSTEM_PROMPT",
+    "You are a precise technical assistant. Use the provided context to answer questions accurately. "
+    "Cite sources using [filename]. If the context doesn't contain the answer, say so.")
 
 SYSTEM_PROMPT_ANALYSIS = (
-    "You are an expert tutor and progress evaluator. "
-    "You have access to the student's entire knowledge base below. "
-    "Analyze the coverage, depth, and connections in the notes. "
-    "Identify what the user has learned well, what is missing, and suggest the next logical steps. "
-    "Do not just summarize; evaluate the progress."
+    "You are an expert learning analytics tutor. Your task is to analyze a student's knowledge base "
+    "and provide insights about their learning progress.\n\n"
+    "When analyzing, consider:\n"
+    "1. What topics/subjects are covered in the notes\n"
+    "2. The depth and complexity of understanding demonstrated\n"
+    "3. Connections between different concepts\n"
+    "4. Gaps or missing fundamental concepts\n"
+    "5. Progression from beginner to advanced topics\n\n"
+    "Provide specific, actionable feedback about:\n"
+    "- What the student has learned well\n"
+    "- Areas that need more attention\n"
+    "- Recommended next topics to study\n"
+    "- How new topics connect to existing knowledge\n\n"
+    "Be encouraging but honest. Format your response clearly with sections."
+)
+
+SYSTEM_PROMPT_SUGGESTION = (
+    "You are a learning path advisor. Based on a student's current knowledge (shown in their notes), "
+    "suggest the next logical topics or skills to learn.\n\n"
+    "Your suggestions should:\n"
+    "1. Build upon existing knowledge\n"
+    "2. Fill identified gaps in understanding\n"
+    "3. Progress naturally from basics to advanced\n"
+    "4. Be specific and actionable\n\n"
+    "Format your response with:\n"
+    "- Recommended topics (with brief explanations)\n"
+    "- Prerequisites needed\n"
+    "- Why each topic is important\n"
+    "- Estimated difficulty level\n"
+    "- How it connects to what they already know"
 )
 
 USER_PROMPT_TEMPLATE = os.getenv("USER_PROMPT_TEMPLATE",
     "Previous Conversation:\n{history}\n\nContext from Docs:\n{context}\n\nCurrent Question: {question}")
 
+# Paths and Models
 MD_DIRECTORY = os.getenv("MD_FOLDER", "./notes")
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
-LLM_MODEL = os.getenv("LLM_MODEL", "llama3")
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "mxbai-embed-large:latest")
+LLM_MODEL = os.getenv("LLM_MODEL", "qwen2.5:7b-instruct-q8_0")
 
 CHROMA_PATH = "./.cache/chroma_db"
 HASH_CACHE = "./.cache/file_hashes.json"
+PROGRESS_CACHE = "./.cache/learning_progress.json"
+
+# Processing Configuration
 MAX_EMBED_CHARS = 380
 CHUNK_SIZE = 1200
 CHUNK_OVERLAP = 200
@@ -64,47 +109,129 @@ TOP_K = 6
 COLLECTION_NAME = "md_rag"
 
 MAX_ANALYSIS_CONTEXT_CHARS = 24000
 
 BATCH_SIZE = 10
 MAX_PARALLEL_FILES = 3
 
+# Learning Configuration
+MAX_SUGGESTIONS = 5
+PROGRESS_SUMMARY_DAYS = 7
+
 # =========================
-# UTILS & CACHE
+# UTILITY FUNCTIONS
 # =========================
 def get_file_hash(file_path: str) -> str:
+    """Generate MD5 hash for file change detection"""
     return hashlib.md5(Path(file_path).read_bytes()).hexdigest()
 
-def load_hash_cache() -> dict:
-    Path(HASH_CACHE).parent.mkdir(parents=True, exist_ok=True)
-    if Path(HASH_CACHE).exists():
-        return json.loads(Path(HASH_CACHE).read_text())
+def load_json_cache(file_path: str) -> dict:
+    """Load JSON cache with error handling"""
+    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+    if Path(file_path).exists():
+        try:
+            return json.loads(Path(file_path).read_text())
+        except json.JSONDecodeError:
+            console.print(f"[yellow]⚠️ Corrupted cache: {file_path}. Resetting.[/yellow]")
+            return {}
     return {}
 
+def save_json_cache(cache: dict, file_path: str):
+    """Save JSON cache with error handling"""
+    try:
+        Path(file_path).write_text(json.dumps(cache, indent=2))
+    except Exception as e:
+        console.print(f"[red]✗ Failed to save cache {file_path}: {e}[/red]")
+
+def load_hash_cache() -> dict:
+    """Load file hash cache"""
+    return load_json_cache(HASH_CACHE)
+
 def save_hash_cache(cache: dict):
-    Path(HASH_CACHE).write_text(json.dumps(cache, indent=2))
+    """Save file hash cache"""
+    save_json_cache(cache, HASH_CACHE)
+
+def load_progress_cache() -> dict:
+    """Load learning progress cache"""
+    return load_json_cache(PROGRESS_CACHE)
+
+def save_progress_cache(cache: dict):
+    """Save learning progress cache"""
+    save_json_cache(cache, PROGRESS_CACHE)
+
+def format_file_size(size_bytes: int) -> str:
+    """Format file size for human reading"""
+    if size_bytes < 1024:
+        return f"{size_bytes} B"
+    elif size_bytes < 1024 * 1024:
+        return f"{size_bytes / 1024:.1f} KB"
+    else:
+        return f"{size_bytes / (1024 * 1024):.1f} MB"
 
 # =========================
-# ROUTING LOGIC
+# INTENT CLASSIFICATION
 # =========================
 def classify_intent(query: str) -> str:
+    """
+    Classify user intent into different modes:
+    - SEARCH: Standard RAG retrieval
+    - ANALYSIS: Progress and knowledge analysis
+    - SUGGEST: Topic and learning suggestions
+    - LEARN: Interactive learning mode
+    - STATS: Progress statistics
+    """
+    query_lower = query.lower().strip()
+
+    # Analysis keywords (progress evaluation)
     analysis_keywords = [
         r"assess my progress", r"eval(uate)? my (learning|knowledge)",
         r"what have i learned", r"summary of (my )?notes",
-        r"my progress", r"learning path", r"knowledge gap",
+        r"my progress", r"learning path", r"knowledge gap", r"analyze my",
         r"оцени (мой )?прогресс", r"что я выучил", r"итоги", r"анализ знаний",
         r"сегодня(?:\s+\w+)*\s*урок", r"что я изучил"
     ]
 
-    query_lower = query.lower()
+    # Suggestion keywords
+    suggestion_keywords = [
+        r"what should i learn next", r"suggest (new )?topics", r"recommend (to )?learn",
+        r"next (topics|lessons)", r"learning suggestions", r"what to learn",
+        r"что учить дальше", r"предложи темы", r"рекомендации по обучению"
+    ]
+
+    # Stats keywords
+    stats_keywords = [
+        r"show stats", r"learning statistics", r"progress stats", r"knowledge stats",
+        r"статистика обучения", r"прогресс статистика"
+    ]
+
+    # Learning mode keywords
+    learn_keywords = [
+        r"start learning", r"learning mode", r"learn new", r"study plan",
+        r"начать обучение", r"режим обучения"
+    ]
+
+    # Check patterns
     for pattern in analysis_keywords:
         if re.search(pattern, query_lower):
             return "ANALYSIS"
+
+    for pattern in suggestion_keywords:
+        if re.search(pattern, query_lower):
+            return "SUGGEST"
+
+    for pattern in stats_keywords:
+        if re.search(pattern, query_lower):
+            return "STATS"
+
+    for pattern in learn_keywords:
+        if re.search(pattern, query_lower):
+            return "LEARN"
+
     return "SEARCH"
 
 # =========================
 # DOCUMENT PROCESSING
 # =========================
 def validate_chunk_size(text: str, max_chars: int = MAX_EMBED_CHARS) -> List[str]:
+    """Split oversized chunks into smaller pieces"""
     if len(text) <= max_chars:
         return [text]
 
@@ -134,14 +261,42 @@ def validate_chunk_size(text: str, max_chars: int = MAX_EMBED_CHARS) -> List[str
     if current: chunks.append(current.strip())
     return [c for c in chunks if c]
 
+def parse_markdown_with_frontmatter(file_path: str) -> tuple[dict, str]:
+    """Parse markdown file and extract YAML frontmatter + content"""
+    content = Path(file_path).read_text(encoding='utf-8')
+
+    # YAML frontmatter pattern
+    frontmatter_pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
+    match = re.match(frontmatter_pattern, content, re.DOTALL)
+
+    if match:
+        try:
+            metadata = yaml.safe_load(match.group(1))
+            metadata = metadata if isinstance(metadata, dict) else {}
+            return metadata, match.group(2)
+        except yaml.YAMLError as e:
+            console.print(f"[yellow]⚠️ YAML error in {Path(file_path).name}: {e}[/yellow]")
+            return {}, content
+
+    return {}, content
+
 class ChunkProcessor:
+    """Handles document chunking and embedding"""
     def __init__(self, vectorstore):
         self.vectorstore = vectorstore
         self.semaphore = asyncio.Semaphore(MAX_PARALLEL_FILES)
 
     async def process_file(self, file_path: str) -> List[Dict]:
+        """Process a single markdown file into chunks"""
         try:
-            docs = await asyncio.to_thread(UnstructuredMarkdownLoader(file_path).load)
+            metadata, content = parse_markdown_with_frontmatter(file_path)
+            metadata["source"] = file_path
+
+            if metadata.get('exclude'):
+                console.print(f"[dim]📋 Found excluded file: {Path(file_path).name}[/dim]")
+
+            docs = [Document(page_content=content, metadata=metadata)]
+
         except Exception as e:
             console.print(f"✗ {Path(file_path).name}: {e}", style="red")
             return []
@@ -154,28 +309,37 @@ class ChunkProcessor:
 
         chunks = []
         for doc_idx, doc in enumerate(docs):
+            doc_metadata = doc.metadata
+
             for chunk_idx, text in enumerate(splitter.split_text(doc.page_content)):
                 safe_texts = validate_chunk_size(text)
                 for sub_idx, safe_text in enumerate(safe_texts):
                     chunks.append({
                         "id": f"{file_path}::{doc_idx}::{chunk_idx}::{sub_idx}",
                         "text": safe_text,
-                        "metadata": {"source": file_path, **doc.metadata}
+                        "metadata": doc_metadata
                     })
         return chunks
 
     async def embed_batch(self, batch: List[Dict]) -> bool:
-        if not batch: return True
+        """Embed a batch of chunks"""
+        if not batch:
+            return True
+
         try:
             docs = [Document(page_content=c["text"], metadata=c["metadata"]) for c in batch]
             ids = [c["id"] for c in batch]
+
+            docs = filter_complex_metadata(docs)
             await asyncio.to_thread(self.vectorstore.add_documents, docs, ids=ids)
             return True
 
         except Exception as e:
             console.print(f"✗ Embed error: {e}", style="red")
             return False
 
     async def index_file(self, file_path: str, cache: dict) -> bool:
+        """Index a single file with change detection"""
         async with self.semaphore:
             current_hash = get_file_hash(file_path)
             if cache.get(file_path) == current_hash:
@@ -184,11 +348,13 @@ class ChunkProcessor:
             chunks = await self.process_file(file_path)
             if not chunks: return False
 
+            # Remove old chunks for this file
             try:
-                self.vectorstore._collection.delete(where={"source": file_path})
+                self.vectorstore._collection.delete(where={"source": {"$eq": file_path}})
             except:
                 pass
 
+            # Embed new chunks in batches
             for i in range(0, len(chunks), BATCH_SIZE):
                 batch = chunks[i:i + BATCH_SIZE]
                 await self.embed_batch(batch)
@@ -201,6 +367,7 @@ class ChunkProcessor:
 # FILE WATCHER
 # =========================
 class DocumentWatcher(FileSystemEventHandler):
+    """Watch for file changes and reindex automatically"""
     def __init__(self, processor, cache):
         self.processor = processor
         self.cache = cache
@@ -223,6 +390,7 @@ class DocumentWatcher(FileSystemEventHandler):
         await asyncio.sleep(1)
 
 def start_watcher(processor, cache):
+    """Start file system watcher"""
     handler = DocumentWatcher(processor, cache)
     observer = Observer()
     observer.schedule(handler, MD_DIRECTORY, recursive=True)
@@ -231,9 +399,10 @@ def start_watcher(processor, cache):
     return observer
 
 # =========================
-# RAG CHAIN FACTORY
+# CONVERSATION MEMORY
 # =========================
 class ConversationMemory:
+    """Manage conversation history"""
     def __init__(self, max_messages: int = 8):
         self.messages = []
         self.max_messages = max_messages
@@ -247,7 +416,112 @@ class ConversationMemory:
         if not self.messages: return "No previous conversation."
         return "\n".join([f"{m['role'].upper()}: {m['content']}" for m in self.messages])
 
+# =========================
+# LEARNING ANALYTICS
+# =========================
+class LearningAnalytics:
+    """Analyze learning progress and provide insights"""
+
+    def __init__(self, vectorstore):
+        self.vectorstore = vectorstore
+
+    async def get_knowledge_summary(self) -> dict:
+        """Get comprehensive knowledge base summary"""
+        try:
+            db_data = await asyncio.to_thread(self.vectorstore.get)
+
+            if not db_data or not db_data['documents']:
+                return {"total_docs": 0, "total_chunks": 0, "subjects": {}}
+
+            # Filter excluded documents
+            filtered_pairs = [
+                (text, meta) for text, meta in zip(db_data['documents'], db_data['metadatas'])
+                if meta and not meta.get('exclude', False)
+            ]
+
+            # Extract subjects/topics from file names and content
+            subjects = defaultdict(lambda: {"chunks": 0, "files": set(), "last_updated": None})
+
+            for text, meta in filtered_pairs:
+                source = meta.get('source', 'unknown')
+                filename = Path(source).stem
+
+                # Simple subject extraction from filename
+                subject = filename.split()[0] if filename else 'Unknown'
+
+                subjects[subject]["chunks"] += 1
+                subjects[subject]["files"].add(source)
+
+                # Track last update (simplified)
+                if not subjects[subject]["last_updated"]:
+                    subjects[subject]["last_updated"] = datetime.now().isoformat()
+
+            # Convert sets to counts
+            for subject in subjects:
+                subjects[subject]["files"] = len(subjects[subject]["files"])
+
+            return {
+                "total_docs": len(filtered_pairs),
+                "total_chunks": len(filtered_pairs),
+                "subjects": dict(subjects)
+            }
+
+        except Exception as e:
+            console.print(f"[red]✗ Error getting knowledge summary: {e}[/red]")
+            return {"total_docs": 0, "total_chunks": 0, "subjects": {}}
+
+    async def get_learning_stats(self) -> dict:
+        """Get detailed learning statistics"""
+        summary = await self.get_knowledge_summary()
+
+        # Load progress history
+        progress_cache = load_progress_cache()
+
+        stats = {
+            "total_topics": len(summary["subjects"]),
+            "total_notes": summary["total_docs"],
+            "total_files": sum(s["files"] for s in summary["subjects"].values()),
+            "topics": list(summary["subjects"].keys()),
+            "progress_history": progress_cache.get("sessions", []),
+            "study_streak": self._calculate_streak(progress_cache.get("sessions", [])),
+            "most_productive_topic": self._get_most_productive_topic(summary["subjects"])
+        }
+
+        return stats
+
+    def _calculate_streak(self, sessions: list) -> int:
+        """Calculate consecutive days of studying"""
+        if not sessions:
+            return 0
+
+        # Simplified streak calculation
+        dates = [datetime.fromisoformat(s.get("date", datetime.now().isoformat())).date()
+                 for s in sessions[-10:]]  # Last 10 sessions
+
+        streak = 0
+        current_date = datetime.now().date()
+
+        for date in reversed(dates):
+            if (current_date - date).days <= 1:
+                streak += 1
+                current_date = date
+            else:
+                break
+
+        return streak
+
+    def _get_most_productive_topic(self, subjects: dict) -> str:
+        """Identify the most studied topic"""
+        if not subjects:
+            return "None"
+
+        return max(subjects.items(), key=lambda x: x[1]["chunks"])[0]
+
+# =========================
+# CHAIN FACTORY
+# =========================
 def get_chain(system_prompt):
+    """Create a LangChain processing chain"""
     llm = ChatOllama(
         model=LLM_MODEL,
         temperature=0.2,
@@ -260,24 +534,256 @@ def get_chain(system_prompt):
     return prompt | llm | StrOutputParser()
 
 # =========================
-# MAIN
+# INTERACTIVE COMMANDS
+# =========================
+class InteractiveCommands:
+    """Handle interactive learning commands"""
+
+    def __init__(self, vectorstore, analytics):
+        self.vectorstore = vectorstore
+        self.analytics = analytics
+
+    async def list_excluded_files(self):
+        """List all files marked with exclude: true"""
+        console.print("\n[bold yellow]📋 Fetching list of excluded files...[/bold yellow]")
+
+        try:
+            excluded_data = await asyncio.to_thread(
+                self.vectorstore.get,
+                where={"exclude": True}
+            )
+
+            if not excluded_data or not excluded_data['metadatas']:
+                console.print("[green]✓ No files are marked for exclusion.[/green]")
+                return
+
+            excluded_files = set()
+            for meta in excluded_data['metadatas']:
+                if meta and 'source' in meta:
+                    excluded_files.add(Path(meta['source']).name)
+
+            console.print(f"\n[bold red]❌ Excluded Files ({len(excluded_files)}):[/bold red]")
+            console.print("=" * 50, style="dim")
+
+            for filename in sorted(excluded_files):
+                console.print(f"  • {filename}", style="red")
+
+            console.print("=" * 50, style="dim")
+            console.print(f"[dim]Total chunks excluded: {len(excluded_data['metadatas'])}[/dim]\n")
+
+        except Exception as e:
+            console.print(f"[red]✗ Error fetching excluded files: {e}[/red]")
+
+    async def show_learning_stats(self):
+        """Display comprehensive learning statistics"""
+        console.print("\n[bold cyan]📊 Learning Statistics[/bold cyan]")
+        console.print("=" * 60, style="dim")
+
+        stats = await self.analytics.get_learning_stats()
+
+        # Display stats in a table
+        table = Table(title="Knowledge Overview", show_header=False)
+        table.add_column("Metric", style="cyan")
+        table.add_column("Value", style="yellow")
+
+        table.add_row("Total Topics Studied", str(stats["total_topics"]))
+        table.add_row("Total Notes Created", str(stats["total_notes"]))
+        table.add_row("Total Files", str(stats["total_files"]))
+        table.add_row("Study Streak (days)", str(stats["study_streak"]))
+        table.add_row("Most Productive Topic", stats["most_productive_topic"])
+
+        console.print(table)
+
+        # Show topics
+        if stats["topics"]:
+            console.print(f"\n[bold green]📚 Topics Studied:[/bold green]")
+            for topic in sorted(stats["topics"]):
+                console.print(f"  ✓ {topic}")
+
+        console.print()
+
+    async def interactive_learning_mode(self):
+        """Start interactive learning mode"""
+        console.print("\n[bold magenta]🎓 Interactive Learning Mode[/bold magenta]")
+        console.print("I'll analyze your current knowledge and suggest what to learn next!\n")
+
+        # First, analyze current knowledge
+        console.print("[cyan]Analyzing your current knowledge base...[/cyan]")
+
+        # Get analysis
+        db_data = await asyncio.to_thread(self.vectorstore.get)
+        all_texts = db_data['documents']
+        all_metadatas = db_data['metadatas']
+
+        # Filter excluded
+        filtered_pairs = [
+            (text, meta) for text, meta in zip(all_texts, all_metadatas)
+            if meta and not meta.get('exclude', False)
+        ]
+
+        if not filtered_pairs:
+            console.print("[yellow]⚠️ No learning materials found. Add some notes first![/yellow]")
+            return
+
+        # Build context for analysis
+        full_context = ""
+        for text, meta in filtered_pairs[:20]:  # Limit context
+            full_context += f"\n---\nSource: {Path(meta['source']).name}\n{text}\n"
+
+        # Get AI analysis
+        chain = get_chain(SYSTEM_PROMPT_ANALYSIS)
+
+        console.print("[cyan]Getting AI analysis of your progress...[/cyan]")
+        analysis_response = ""
+        async for chunk in chain.astream({
+            "context": full_context,
+            "question": "Analyze my learning progress and identify what I've learned well and what gaps exist.",
+            "history": ""
+        }):
+            analysis_response += chunk
+
+        console.print(f"\n[bold green]📈 Your Learning Analysis:[/bold green]")
+        console.print(analysis_response)
+
+        # Get suggestions
+        console.print("\n[cyan]Generating personalized learning suggestions...[/cyan]")
+
+        suggestion_chain = get_chain(SYSTEM_PROMPT_SUGGESTION)
+        suggestion_response = ""
+        async for chunk in suggestion_chain.astream({
+            "context": full_context,
+            "question": "Based on this student's current knowledge, what should they learn next?",
+            "history": ""
+        }):
+            suggestion_response += chunk
+
+        console.print(f"\n[bold blue]💡 Recommended Next Topics:[/bold blue]")
+        console.print(suggestion_response)
+
+        # Save progress
+        progress_cache = load_progress_cache()
+        if "sessions" not in progress_cache:
+            progress_cache["sessions"] = []
+
+        progress_cache["sessions"].append({
+            "date": datetime.now().isoformat(),
+            "type": "analysis",
+            "topics_count": len(filtered_pairs)
+        })
+
+        save_progress_cache(progress_cache)
+
+        console.print(f"\n[green]✓ Analysis complete! Add notes about the suggested topics and run 'learning mode' again.[/green]")
+
+    async def suggest_topics(self):
+        """Suggest new topics to learn"""
+        console.print("\n[bold blue]💡 Topic Suggestions[/bold blue]")
+
+        # Get current knowledge
+        db_data = await asyncio.to_thread(self.vectorstore.get)
+        all_texts = db_data['documents']
+        all_metadatas = db_data['metadatas']
+
+        filtered_pairs = [
+            (text, meta) for text, meta in zip(all_texts, all_metadatas)
+            if meta and not meta.get('exclude', False)
+        ][:15]  # Limit context
+
+        if not filtered_pairs:
+            console.print("[yellow]⚠️ No notes found. Start by creating some learning materials![/yellow]")
+            return
+
+        # Build context
+        context = ""
+        for text, meta in filtered_pairs:
+            context += f"\n---\nSource: {Path(meta['source']).name}\n{text}\n"
+
+        # Get suggestions from AI
+        chain = get_chain(SYSTEM_PROMPT_SUGGESTION)
+
+        console.print("[cyan]Analyzing your knowledge and generating suggestions...[/cyan]\n")
+
+        response = ""
+        async for chunk in chain.astream({
+            "context": context,
+            "question": "What are the next logical topics for this student to learn?",
+            "history": ""
+        }):
+            response += chunk
+            console.print(chunk, end="")
+
+        console.print("\n")
+
+    async def exclude_file_interactive(self):
+        """Interactively exclude a file from learning analysis"""
+        console.print("\n[bold yellow]📁 Exclude File from Analysis[/bold yellow]")
+
+        # List all non-excluded files
+        db_data = await asyncio.to_thread(self.vectorstore.get)
+        files = set()
+
+        for meta in db_data['metadatas']:
+            if meta and 'source' in meta and not meta.get('exclude', False):
+                files.add(meta['source'])
+
+        if not files:
+            console.print("[yellow]⚠️ No files found to exclude.[/yellow]")
+            return
+
+        # Show files
+        file_list = sorted(list(files))
+        console.print("\n[bold]Available files:[/bold]")
+        for i, file_path in enumerate(file_list, 1):
+            console.print(f"  {i}. {Path(file_path).name}")
+
+        # Get user choice
+        choice = Prompt.ask("\nSelect file number to exclude",
+                            choices=[str(i) for i in range(1, len(file_list) + 1)],
+                            default="1")
+
+        selected_file = file_list[int(choice) - 1]
+
+        # Confirmation
+        if Confirm.ask(f"\nExclude '{Path(selected_file).name}' from learning analysis?"):
+            # Update the file's metadata in vectorstore
+            try:
+                # Note: In a real implementation, you'd need to update the file's frontmatter
+                # For now, we'll show instructions
+                console.print(f"\n[red]⚠️ Manual action required:[/red]")
+                console.print(f"Add 'exclude: true' to the frontmatter of:")
+                console.print(f"  {selected_file}")
+                console.print(f"\n[dim]Example:[/dim]")
+                console.print("```\n---\nexclude: true\n---\n```")
+                console.print(f"\n[green]The file will be excluded on next reindex.[/green]")
+            except Exception as e:
+                console.print(f"[red]✗ Error: {e}[/red]")
+
+# =========================
+# MAIN APPLICATION
 # =========================
 async def main():
+    """Main application entry point"""
+
+    # Setup directories
     Path(MD_DIRECTORY).mkdir(parents=True, exist_ok=True)
     Path(CHROMA_PATH).parent.mkdir(parents=True, exist_ok=True)
 
+    # Display welcome banner
     console.print(Panel.fit(
-        f"[bold cyan]⚡ Dual-Mode RAG System[/bold cyan]\n"
-        f"📂 Docs: {MD_DIRECTORY}\n"
-        f"🧠 Embed: {EMBEDDING_MODEL}\n"
-        f"🤖 LLM: {LLM_MODEL}",
+        f"[bold cyan]⚡ RAG Learning System[/bold cyan]\n"
+        f"📂 Notes Directory: {MD_DIRECTORY}\n"
+        f"🧠 Embedding Model: {EMBEDDING_MODEL}\n"
+        f"🤖 LLM Model: {LLM_MODEL}\n"
+        f"[dim]Commands: /help for available commands[/dim]",
         border_style="cyan"
     ))
 
+    # Initialize components
     embeddings = OllamaEmbeddings(
         model=EMBEDDING_MODEL,
         base_url=OLLAMA_BASE_URL
     )
 
     vectorstore = Chroma(
         collection_name=COLLECTION_NAME,
         persist_directory=CHROMA_PATH,
@@ -285,9 +791,14 @@ async def main():
     )
 
     processor = ChunkProcessor(vectorstore)
+    analytics = LearningAnalytics(vectorstore)
+    commands = InteractiveCommands(vectorstore, analytics)
+
     cache = load_hash_cache()
 
-    # Checking documents
+    # Index existing documents
+    console.print(f"\n[bold yellow]📚 Indexing documents...[/bold yellow]")
+
     files = [
         os.path.join(root, file)
         for root, _, files in os.walk(MD_DIRECTORY)
@@ -299,49 +810,106 @@ async def main():
         async with semaphore:
             return await processor.index_file(fp, cache)
 
+    # Use progress bar for indexing
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        console=console
+    ) as progress:
+        task = progress.add_task("Indexing files...", total=len(files))
+
         tasks = [sem_task(fp) for fp in files]
         for fut in asyncio.as_completed(tasks):
             await fut
+            progress.advance(task)
 
     save_hash_cache(cache)
 
+    # Start file watcher
     observer = start_watcher(processor, cache)
     memory = ConversationMemory()
 
+    # Show help hint
+    console.print(f"\n[dim]💡 Type /help to see available commands[/dim]\n")
+
     try:
         while True:
+            # Get user input
             query = await session.prompt_async("> ", style=style)
             query = query.strip()
-            if query.lower() in {"exit", "quit", "q"}:
-                console.print("\nGoodbye!", style="yellow")
+
+            if not query:
+                continue
+
+            # Handle commands
+            if query.startswith('/'):
+                command = query[1:].lower().strip()
+
+                if command in ['exit', 'quit', 'q']:
+                    console.print("\n👋 Goodbye!", style="yellow")
                     break
-            if not query: continue
+
+                elif command in ['help', 'h']:
+                    await show_help()
+
+                elif command in ['stats', 'statistics']:
+                    await commands.show_learning_stats()
+
+                elif command in ['excluded', 'list-excluded']:
+                    await commands.list_excluded_files()
+
+                elif command in ['learning-mode', 'learn']:
+                    await commands.interactive_learning_mode()
+
+                elif command in ['suggest', 'suggestions']:
+                    await commands.suggest_topics()
+
+                elif command in ['exclude']:
+                    await commands.exclude_file_interactive()
+
+                elif command in ['reindex']:
+                    console.print("\n[yellow]🔄 Reindexing all files...[/yellow]")
+                    cache.clear()
+                    for file_path in files:
+                        await processor.index_file(file_path, cache)
+                    save_hash_cache(cache)
+                    console.print("[green]✓ Reindexing complete![/green]")
+
+                else:
+                    console.print(f"[red]✗ Unknown command: {command}[/red]")
+                    console.print("[dim]Type /help to see available commands[/dim]")
+
+                continue
+
+            # Process normal queries
             console.print()
 
             mode = classify_intent(query)
             history_str = memory.get_history()
 
             if mode == "SEARCH":
-                console.print("🔍 SEARCH MODE (Top-K)", style="bold blue")
+                console.print("🔍 SEARCH MODE (Top-K Retrieval)", style="bold blue")
+
                 retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
                 docs = await asyncio.to_thread(retriever.invoke, query)
-                context_str = "\n\n".join(f"[{Path(d.metadata['source']).name}]\n{d.page_content}" for d in docs)
+                context_str = "\n\n".join(
+                    f"[{Path(d.metadata['source']).name}]\n{d.page_content}"
+                    for d in docs
+                )
+
                 chain = get_chain(SYSTEM_PROMPT_SEARCH)
 
-            else:  # ANALYSIS MODE
-                console.print("📊 ANALYSIS MODE (Full Context)", style="bold magenta")
+            elif mode == "ANALYSIS":
+                console.print("📊 ANALYSIS MODE (Full Context Evaluation)", style="bold magenta")
+
                 db_data = await asyncio.to_thread(vectorstore.get)
                 all_texts = db_data['documents']
                 all_metas = db_data['metadatas']
 
                 if not all_texts:
-                    console.print("No documents found to analyze!", style="red")
+                    console.print("[red]No documents found to analyze![/red]")
                     continue
 
-                # Exclude chunks where metadata has exclude: true
+                # Filter excluded chunks
                 filtered_pairs = [
                     (text, meta) for text, meta in zip(all_texts, all_metas)
                     if meta and not meta.get('exclude', False)
@@ -352,15 +920,14 @@ async def main():
                 console.print(f"ℹ Excluded {excluded_count} chunks marked 'exclude: true'", style="dim")
 
                 if not filtered_pairs:
-                    console.print("All documents are marked for exclusion. Nothing to analyze.", style="yellow")
+                    console.print("[yellow]All documents are marked for exclusion. Nothing to analyze.[/yellow]")
                     continue
 
+                # Build context
                 full_context = ""
                 char_count = 0
 
-                paired = sorted(filtered_pairs, key=lambda x: x[1]['source'])
-
-                for text, meta in paired:
+                for text, meta in filtered_pairs[:25]:  # Limit for analysis
                     entry = f"\n---\nSource: {Path(meta['source']).name}\n{text}\n"
                     if char_count + len(entry) > MAX_ANALYSIS_CONTEXT_CHARS:
                         full_context += "\n[...Truncated due to context limit...]"
@@ -372,6 +939,19 @@ async def main():
                 context_str = full_context
                 chain = get_chain(SYSTEM_PROMPT_ANALYSIS)
 
+            elif mode == "SUGGEST":
+                await commands.suggest_topics()
+                continue
+
+            elif mode == "STATS":
+                await commands.show_learning_stats()
+                continue
+
+            elif mode == "LEARN":
+                await commands.interactive_learning_mode()
+                continue
+
+            # Generate and display response
             response = ""
             console.print(f"Context size: {len(context_str)} chars", style="dim")
             console.print("Assistant:", style="blue", end=" ")
@@ -385,20 +965,60 @@ async def main():
                 response += chunk
             console.print("\n")
 
+            # Update conversation memory
             memory.add("user", query)
             memory.add("assistant", response)
 
     finally:
+        # Cleanup
         observer.stop()
         observer.join()
 
+async def show_help():
+    """Display help information"""
+    console.print("\n[bold cyan]📖 Available Commands:[/bold cyan]")
+    console.print("=" * 50, style="dim")
+
+    commands = [
+        ("/help", "Show this help message"),
+        ("/stats", "Display learning statistics and progress"),
+        ("/learning-mode", "Start interactive learning analysis"),
+        ("/suggest", "Get topic suggestions for next study"),
+        ("/excluded", "List files excluded from analysis"),
+        ("/exclude", "Interactively exclude a file"),
+        ("/reindex", "Reindex all documents"),
+        ("/exit, /quit, /q", "Exit the application"),
+    ]
+
+    for cmd, desc in commands:
+        console.print(f"[yellow]{cmd:<20}[/yellow] {desc}")
+
+    console.print("\n[bold cyan]🎯 Learning Modes:[/bold cyan]")
+    console.print("=" * 50, style="dim")
+    console.print("• [blue]Search Mode[/blue]: Ask questions about your notes")
+    console.print("• [magenta]Analysis Mode[/magenta]: Get progress evaluation")
+    console.print("• [green]Suggestion Mode[/green]: Get topic recommendations")
+
+    console.print("\n[bold cyan]💡 Examples:[/bold cyan]")
+    console.print("=" * 50, style="dim")
+    console.print("• \"What is SQL JOIN?\" → Search your notes")
+    console.print("• \"Assess my progress\" → Analyze learning")
+    console.print("• \"What should I learn next?\" → Get suggestions")
+    console.print("• \"Show my statistics\" → Display progress")
+
+    console.print()
+
 if __name__ == "__main__":
     import nest_asyncio
     nest_asyncio.apply()
+
     try:
         import asyncio
         loop = asyncio.get_event_loop()
         loop.run_until_complete(main())
     except KeyboardInterrupt:
-        console.print("\nGoodbye!", style="yellow")
+        console.print("\n👋 Goodbye!", style="yellow")
         sys.exit(0)
+    except Exception as e:
+        console.print(f"\n[red]✗ Unexpected error: {e}[/red]")
+        sys.exit(1)

pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
     "nest-asyncio>=1.6.0",
     "prompt-toolkit>=3.0.52",
     "python-dotenv>=1.2.1",
+    "pyyaml>=6.0.3",
     "rich>=14.2.0",
     "unstructured[md]>=0.18.21",
     "watchdog>=6.0.0",
uv.lock (generated)
@@ -2100,6 +2100,7 @@ dependencies = [
     { name = "nest-asyncio" },
     { name = "prompt-toolkit" },
     { name = "python-dotenv" },
+    { name = "pyyaml" },
     { name = "rich" },
     { name = "unstructured", extra = ["md"] },
     { name = "watchdog" },
@@ -2114,6 +2115,7 @@ requires-dist = [
     { name = "nest-asyncio", specifier = ">=1.6.0" },
     { name = "prompt-toolkit", specifier = ">=3.0.52" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
+    { name = "pyyaml", specifier = ">=6.0.3" },
     { name = "rich", specifier = ">=14.2.0" },
     { name = "unstructured", extras = ["md"], specifier = ">=0.18.21" },
     { name = "watchdog", specifier = ">=6.0.0" },