Initial commit

.env.example (Normal file, 3 lines)
@@ -0,0 +1,3 @@
MD_FOLDER=my_docs
EMBEDDING_MODEL=mxbai-embed-large:latest
LLM_MODEL=qwen2.5:7b-instruct-q8_0

.gitignore (Normal file, vendored, 15 lines)
@@ -0,0 +1,15 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

# Local files
.env
.cache/
my_docs/

.python-version (Normal file, 1 line)
@@ -0,0 +1 @@
3.13

README.md (Normal file, 22 lines)
@@ -0,0 +1,22 @@
# Local RAG System for Markdown Files

## Requirements

1. Install [Ollama](https://ollama.com/download/)
2. Pull the required models:

```bash
ollama pull qwen2.5:7b-instruct-q8_0
ollama pull mxbai-embed-large:latest
```

You can use any models, but update the model names in `.env` to match.
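
For example (the model names below are only illustrations, not part of this setup; any chat and embedding models available in Ollama will work):

```bash
ollama pull llama3.1:8b
ollama pull nomic-embed-text
```

Then set `LLM_MODEL=llama3.1:8b` and `EMBEDDING_MODEL=nomic-embed-text` in `.env`.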

## Run

```bash
cp .env.example .env
uv sync
uv run main.py
```

main.py (Normal file, 368 lines)
@@ -0,0 +1,368 @@
#!/usr/bin/env python3
import os
import sys
import json
import hashlib
import asyncio
from pathlib import Path
from collections import deque
from typing import List, Dict

import torch
from dotenv import load_dotenv
from rich.console import Console
from rich.panel import Panel
from prompt_toolkit import PromptSession
from prompt_toolkit.styles import Style
from prompt_toolkit.patch_stdout import patch_stdout

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# =========================
# CONFIG
# =========================
console = Console()
session = PromptSession()
load_dotenv()

style = Style.from_dict({"prompt": "bold #6a0dad"})

MD_DIRECTORY = os.getenv("MD_FOLDER")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
LLM_MODEL = os.getenv("LLM_MODEL")

CHROMA_PATH = "./.cache/chroma_db"
HASH_CACHE = "./.cache/file_hashes.json"

MAX_EMBED_CHARS = 380
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200
TOP_K = 6
COLLECTION_NAME = "md_rag"

BATCH_SIZE = 10
MAX_PARALLEL_FILES = 3

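# Tuning notes (based on how these constants are used below):
# - MAX_EMBED_CHARS caps the characters per chunk handed to the embedder;
#   validate_chunk_size() re-splits anything longer.
# - CHUNK_SIZE / CHUNK_OVERLAP configure the RecursiveCharacterTextSplitter.
# - TOP_K is the number of chunks retrieved per question.
# - BATCH_SIZE is the number of documents per add_documents() call.
# - MAX_PARALLEL_FILES bounds how many files are indexed concurrently.
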
# =========================
# GPU SETUP
# =========================
def setup_gpu():
    if torch.cuda.is_available():
        torch.cuda.set_per_process_memory_fraction(0.95)
    else:
        console.print("[yellow]⚠ CPU mode[/yellow]")

setup_gpu()

# =========================
# HASH CACHE
# =========================
def get_file_hash(file_path: str) -> str:
    return hashlib.md5(Path(file_path).read_bytes()).hexdigest()

def load_hash_cache() -> dict:
    Path(HASH_CACHE).parent.mkdir(parents=True, exist_ok=True)
    if Path(HASH_CACHE).exists():
        return json.loads(Path(HASH_CACHE).read_text())
    return {}

def save_hash_cache(cache: dict):
    Path(HASH_CACHE).write_text(json.dumps(cache, indent=2))

# =========================
# CHUNK VALIDATION
# =========================
def validate_chunk_size(text: str, max_chars: int = MAX_EMBED_CHARS) -> List[str]:
    # Re-split any chunk longer than max_chars: first at sentence boundaries,
    # then word by word for sentences that are themselves too long.
    if len(text) <= max_chars:
        return [text]

    sentences = text.replace('. ', '.|').replace('! ', '!|').replace('? ', '?|').split('|')
    chunks = []
    current = ""

    for sentence in sentences:
        if len(current) + len(sentence) <= max_chars:
            current += sentence
        else:
            if current:
                chunks.append(current.strip())
            if len(sentence) > max_chars:
                words = sentence.split()
                temp = ""
                for word in words:
                    if len(temp) + len(word) + 1 <= max_chars:
                        temp += word + " "
                    else:
                        if temp:
                            chunks.append(temp.strip())
                        temp = word + " "
                if temp:
                    chunks.append(temp.strip())
                current = ""  # already flushed above; reset so it is not appended twice
            else:
                current = sentence

    if current:
        chunks.append(current.strip())

    return [c for c in chunks if c]

# =========================
# DOCUMENT PROCESSING
# =========================
class ChunkProcessor:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
        self.semaphore = asyncio.Semaphore(MAX_PARALLEL_FILES)

    async def process_file(self, file_path: str) -> List[Dict]:
        try:
            docs = await asyncio.to_thread(
                UnstructuredMarkdownLoader(file_path).load
            )
        except Exception as e:
            console.print(f"[red]✗ {Path(file_path).name}: {e}[/red]")
            return []

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separators=["\n\n", "\n", ". ", " "]
        )

        chunks = []
        for doc_idx, doc in enumerate(docs):
            for chunk_idx, text in enumerate(splitter.split_text(doc.page_content)):
                safe_texts = validate_chunk_size(text)
                for sub_idx, safe_text in enumerate(safe_texts):
                    chunks.append({
                        "id": f"{file_path}::{doc_idx}::{chunk_idx}::{sub_idx}",
                        "text": safe_text,
                        "metadata": {"source": file_path, **doc.metadata}
                    })
        return chunks

    async def embed_batch(self, batch: List[Dict]) -> bool:
        if not batch:
            return True

        try:
            docs = [Document(page_content=c["text"], metadata=c["metadata"])
                    for c in batch]
            ids = [c["id"] for c in batch]

            await asyncio.to_thread(
                self.vectorstore.add_documents,
                docs,
                ids=ids
            )
            return True

        except Exception as e:
            error_msg = str(e).lower()
            if "context length" in error_msg or "input length" in error_msg:
                console.print("[yellow]⚠ Oversized chunk detected, processing individually[/yellow]")
                for item in batch:
                    try:
                        doc = Document(page_content=item["text"], metadata=item["metadata"])
                        await asyncio.to_thread(
                            self.vectorstore.add_documents,
                            [doc],
                            ids=[item["id"]]
                        )
                    except Exception:
                        console.print(f"[red]✗ Skipping chunk (too large): {len(item['text'])} chars[/red]")
                        continue
                return True
            else:
                console.print(f"[red]✗ Embed error: {e}[/red]")
                return False

    async def index_file(self, file_path: str, cache: dict) -> bool:
        async with self.semaphore:
            current_hash = get_file_hash(file_path)
            if cache.get(file_path) == current_hash:
                return False

            chunks = await self.process_file(file_path)
            if not chunks:
                return False

            for i in range(0, len(chunks), BATCH_SIZE):
                batch = chunks[i:i + BATCH_SIZE]
                success = await self.embed_batch(batch)
                if not success:
                    console.print(f"[yellow]⚠ Partial failure in {Path(file_path).name}[/yellow]")

            cache[file_path] = current_hash
            console.print(f"[green]✓ {Path(file_path).name} ({len(chunks)} chunks)[/green]")
            return True

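# Per-file indexing pipeline (ChunkProcessor): skip unchanged files via the MD5
# cache, load the markdown, split into ~CHUNK_SIZE chunks, re-split anything over
# MAX_EMBED_CHARS, then embed in BATCH_SIZE batches with a single-document
# fallback when the embedder rejects a batch for length.
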
# =========================
# FILE WATCHER
# =========================
class DocumentWatcher(FileSystemEventHandler):
    def __init__(self, processor, cache):
        self.processor = processor
        self.cache = cache
        self.queue = deque()
        self.processing = False

    def on_modified(self, event):
        if not event.is_directory and event.src_path.endswith(".md"):
            self.queue.append(event.src_path)

    async def process_queue(self):
        while True:
            if self.queue and not self.processing:
                self.processing = True
                file_path = self.queue.popleft()
                if Path(file_path).exists():
                    await self.processor.index_file(file_path, self.cache)
                    save_hash_cache(self.cache)
                self.processing = False
            await asyncio.sleep(1)

def start_watcher(processor, cache):
    handler = DocumentWatcher(processor, cache)
    observer = Observer()
    observer.schedule(handler, MD_DIRECTORY, recursive=True)
    observer.start()
    asyncio.create_task(handler.process_queue())
    return observer

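# The watchdog callback only enqueues modified .md paths; process_queue() drains
# the queue about once per second and re-indexes only files whose hash changed.
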
# =========================
# RAG CHAIN & MEMORY
# =========================
class ConversationMemory:
    def __init__(self, max_messages: int = 8):
        self.messages = []
        self.max_messages = max_messages

    def add(self, role: str, content: str):
        self.messages.append({"role": role, "content": content})
        if len(self.messages) > self.max_messages:
            self.messages.pop(0)

    def get_history(self) -> str:
        if not self.messages:
            return "No previous conversation."
        return "\n".join([f"{m['role'].upper()}: {m['content']}" for m in self.messages])

def get_rag_components(retriever):
    llm = ChatOllama(model=LLM_MODEL, temperature=0.1)

    # FIX 1: Added {history} to the prompt
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a precise technical assistant. Cite sources using [filename]. Be concise."),
        ("human", "Previous Conversation:\n{history}\n\nContext from Docs:\n{context}\n\nCurrent Question: {question}")
    ])

    return prompt | llm | StrOutputParser()

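# Note: the retriever argument is not used inside get_rag_components(); retrieval
# happens in the main loop and the formatted documents are passed in as {context}.
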
# =========================
# MAIN
# =========================
async def main():
    Path(MD_DIRECTORY).mkdir(parents=True, exist_ok=True)
    Path(CHROMA_PATH).parent.mkdir(parents=True, exist_ok=True)

    console.print(Panel.fit(
        f"[bold cyan]⚡ RAG System[/bold cyan]\n"
        f"📂 Docs: {MD_DIRECTORY}\n"
        f"🧠 Embed: {EMBEDDING_MODEL}\n"
        f"🤖 LLM: {LLM_MODEL}",
        border_style="cyan"
    ))

    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)
    vectorstore = Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory=CHROMA_PATH,
        embedding_function=embeddings
    )

    processor = ChunkProcessor(vectorstore)
    cache = load_hash_cache()

    console.print("\n[yellow]Indexing documents...[/yellow]")
    files = [
        os.path.join(root, file)
        for root, _, files in os.walk(MD_DIRECTORY)
        for file in files if file.endswith(".md")
    ]

    semaphore = asyncio.Semaphore(MAX_PARALLEL_FILES)
    async def sem_task(fp):
        async with semaphore:
            return await processor.index_file(fp, cache)

    tasks = [sem_task(fp) for fp in files]
    for fut in asyncio.as_completed(tasks):
        await fut
    save_hash_cache(cache)

    console.print(f"[green]✓ Processed {len(files)} files[/green]\n")

    observer = start_watcher(processor, cache)

    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": TOP_K}
    )

    rag_chain = get_rag_components(retriever)
    memory = ConversationMemory()

    console.print("[bold green]💬 Ready![/bold green]\n")

    try:
        with patch_stdout():
            while True:
                query = await session.prompt_async("> ", style=style)
                query = query.strip()
                if query.lower() in {"exit", "quit", "q"}:
                    print("Goodbye!")
                    break
                if not query:
                    continue

                docs = await asyncio.to_thread(retriever.invoke, query)
                context_str = "\n\n".join(f"[{Path(d.metadata['source']).name}]\n{d.page_content}" for d in docs)
                history_str = memory.get_history()

                response = ""
                async for chunk in rag_chain.astream({
                    "context": context_str,
                    "question": query,
                    "history": history_str
                }):
                    print(chunk, end="")
                    response += chunk
                console.print("\n")

                memory.add("user", query)
                memory.add("assistant", response)

    finally:
        observer.stop()
        observer.join()

if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main())
    except KeyboardInterrupt:
        console.print("\n[yellow]Goodbye![/yellow]")
        sys.exit(0)
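
A note on per-run overrides: `load_dotenv()` does not overwrite variables that are already set in the shell (python-dotenv's default behavior), so a one-off run against a different folder can be done from the command line; the path below is only an example:

```bash
MD_FOLDER=./notes uv run main.py
```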

pyproject.toml (Normal file, 19 lines)
@@ -0,0 +1,19 @@
[project]
name = "rag-llm"
version = "0.1.0"
description = "Local RAG system for Markdown files using Ollama and Chroma"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "chromadb>=1.4.0",
    "langchain-chroma>=1.1.0",
    "langchain-community>=0.4.1",
    "langchain-ollama>=1.0.1",
    "nest-asyncio>=1.6.0",
    "prompt-toolkit>=3.0.52",
    "python-dotenv>=1.2.1",
    "rich>=14.2.0",
    "torch>=2.9.1",
    "unstructured[md]>=0.18.21",
    "watchdog>=6.0.0",
]