From 0e4e438cbc8a08881ca35d7cc86914b33557f734 Mon Sep 17 00:00:00 2001
From: y9938
Date: Tue, 30 Dec 2025 02:30:08 +0300
Subject: [PATCH] feat: system/user prompts in .env

---
 .env.example | 10 ++++++++++
 main.py      | 21 ++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/.env.example b/.env.example
index a2caab6..af6f481 100644
--- a/.env.example
+++ b/.env.example
@@ -1,3 +1,13 @@
 MD_FOLDER=my_docs
 EMBEDDING_MODEL=mxbai-embed-large:latest
 LLM_MODEL=qwen2.5:7b-instruct-q8_0
+
+SYSTEM_PROMPT="You are a precise technical assistant. Cite sources using [filename]. Be concise."
+
+USER_PROMPT_TEMPLATE="Previous Conversation:
+{history}
+
+Context from Docs:
+{context}
+
+Current Question: {question}"
diff --git a/main.py b/main.py
index 19873be..3c169dc 100644
--- a/main.py
+++ b/main.py
@@ -36,6 +36,10 @@ load_dotenv()
 
 style = Style.from_dict({"prompt": "bold #6a0dad"})
 
+SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "You are a precise technical assistant. Cite sources using [filename]. Be concise.")
+USER_PROMPT_TEMPLATE = os.getenv("USER_PROMPT_TEMPLATE",
+    "Previous Conversation:\n{history}\n\nContext from Docs:\n{context}\n\nCurrent Question: {question}")
+
 MD_DIRECTORY = os.getenv("MD_FOLDER")
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 LLM_MODEL = os.getenv("LLM_MODEL")
@@ -58,6 +62,18 @@ MAX_PARALLEL_FILES = 3
 def setup_gpu():
     if torch.cuda.is_available():
         torch.cuda.set_per_process_memory_fraction(0.95)
+
+        device_id = torch.cuda.current_device()
+        device_name = torch.cuda.get_device_name(device_id)
+
+        # VRAM info (in GB)
+        total_vram = torch.cuda.get_device_properties(device_id).total_memory / (1024**3)
+        allocated = torch.cuda.memory_allocated(device_id) / (1024**3)
+        reserved = torch.cuda.memory_reserved(device_id) / (1024**3)
+        free = total_vram - reserved
+
+        console.print(f"[green]✓ GPU: {device_name}[/green]")
+        console.print(f"[blue]  VRAM: {total_vram:.1f}GB total | {free:.1f}GB free | {allocated:.1f}GB allocated[/blue]")
     else:
         console.print("[yellow]⚠ CPU mode[/yellow]")
 
@@ -260,10 +276,9 @@ class ConversationMemory:
 def get_rag_components(retriever):
     llm = ChatOllama(model=LLM_MODEL, temperature=0.1)
 
-    # FIX 1: Added {history} to the prompt
     prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are a precise technical assistant. Cite sources using [filename]. Be concise."),
-        ("human", "Previous Conversation:\n{history}\n\nContext from Docs:\n{context}\n\nCurrent Question: {question}")
+        ("system", SYSTEM_PROMPT),
+        ("human", USER_PROMPT_TEMPLATE)
     ])
 
     return prompt | llm | StrOutputParser()