Files
vi/services/oracle/llm/llm_manager.py
Alex Kazaiev ee1cb5540a Add Oracle service - LLM wrapper
First service for Vi's nervous system:
- Oracle service with NATS integration
- vLLM backend for Qwen3-32B
- GPTQ quantization support
- Thinking mode sampling configs

Simplified from Lyra's patterns, ready to test.

🦊
2026-01-02 13:19:15 -06:00

128 lines
3.8 KiB
Python

"""
LLM Manager for Vi's Oracle service.
Coordinates model loading and text generation.
"""
import time
from typing import Optional, Dict, Any
from core.logger import setup_logger
from .model_loader import ModelLoader
from .generator import TextGenerator
# Module-level logger used by LLMManager below; created through the project's
# setup_logger helper with the name 'llm_manager' and service_name 'oracle_service'.
logger = setup_logger('llm_manager', service_name='oracle_service')
class LLMManager:
    """High-level LLM manager.

    Ties a ``ModelLoader`` (owns the backend handle) to a ``TextGenerator``
    (runs generation against that handle) and exposes a single async
    ``generate_response`` entry point that always returns a string.
    """

    def __init__(self, model_path: Optional[str] = None):
        """Create the manager without loading anything.

        Args:
            model_path: Forwarded unchanged to ``ModelLoader``; ``None`` lets
                the loader decide which model to use.
        """
        self.model_loader = ModelLoader(model_path)
        # Created by load_model() (or lazily by _ensure_generator()).
        self.generator = None

        # Sampling config for Qwen3 thinking mode
        self.thinking_mode_config = {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "min_p": 0.0,
            "max_new_tokens": 8192,
            "repetition_penalty": 1.1,
            "do_sample": True,
        }

        # Same shape as above; only temperature and top_p differ.
        self.non_thinking_mode_config = {
            "temperature": 0.7,
            "top_p": 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "max_new_tokens": 8192,
            "repetition_penalty": 1.1,
            "do_sample": True,
        }

        # Config handed to TextGenerator at construction time. It is a copy,
        # so later edits to it do not alter the thinking-mode template.
        self.sampling_config = self.thinking_mode_config.copy()

    @property
    def is_loaded(self) -> bool:
        """Whether the underlying loader reports a loaded model."""
        return self.model_loader.is_loaded

    @property
    def model_path(self) -> str:
        """Model path as reported by the loader."""
        return self.model_loader.model_path

    @property
    def model_name(self) -> Optional[str]:
        """Model name as reported by the loader."""
        return self.model_loader.model_name

    @property
    def backend_type(self) -> Optional[str]:
        """Backend type as reported by the loader."""
        return self.model_loader.backend_type

    async def load_model(self, model_path: Optional[str] = None) -> bool:
        """Load model and initialize generator.

        Args:
            model_path: Forwarded unchanged to ``ModelLoader.load_model``.

        Returns:
            The loader's success flag. A generator is (re)built only when the
            load succeeded and the loader exposes a truthy ``llm`` handle.
        """
        success = await self.model_loader.load_model(model_path)

        if success and self.model_loader.llm:
            self.generator = TextGenerator(self.model_loader.llm, self.sampling_config)
            logger.info("[✺] TextGenerator initialized")

        return success

    async def unload_model(self):
        """Unload model.

        The generator reference is dropped first so nothing keeps using the
        backend handle while the loader tears it down.
        """
        self.generator = None
        await self.model_loader.unload_model()

    def get_model_info(self) -> Dict[str, Any]:
        """Get model information (delegates to the loader)."""
        return self.model_loader.get_model_info()

    def _ensure_generator(self) -> bool:
        """Make sure ``self.generator`` exists, building it if possible.

        Returns:
            True when a generator is available afterwards; False when there is
            neither an existing generator nor a truthy ``llm`` handle on the
            loader to build one from.
        """
        if self.generator is not None:
            return True

        llm = self.model_loader.llm
        if not llm:
            return False

        self.generator = TextGenerator(llm, self.sampling_config)
        logger.info("[✺] TextGenerator initialized")
        return True

    async def generate_response(
        self,
        prompt: str,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        enable_thinking: bool = True,
    ) -> Optional[str]:
        """Generate a response using the loaded model.

        Loads the model on demand if the loader reports it is not loaded.

        Args:
            prompt: Text passed as the first argument to the generator.
            temperature: Sampling temperature; ``None`` uses the selected
                mode's default.
            max_tokens: Generation length limit; ``None`` uses the selected
                mode's ``max_new_tokens``.
            enable_thinking: Selects ``thinking_mode_config`` (True) or
                ``non_thinking_mode_config`` (False) as the source of
                sampling defaults.

        Returns:
            The stripped generated text, ``""`` when the generator returned
            nothing, or a fixed fallback sentence when the model is
            unavailable or generation raised. Exceptions are not propagated.
        """
        try:
            if not self.is_loaded:
                logger.warning("[✺] Model not loaded")
                if not await self.load_model():
                    return "I'm having trouble thinking right now."

            # The loader can report "loaded" while no generator exists (e.g.
            # load succeeded without an llm handle). Detect that explicitly
            # instead of letting a None attribute access fall into the
            # generic error handler below.
            if not self._ensure_generator():
                logger.error("[✺] Model reported as loaded but no generator is available")
                return "I'm having trouble thinking right now."

            mode_config = (
                self.thinking_mode_config
                if enable_thinking
                else self.non_thinking_mode_config
            )

            if temperature is None:
                temperature = mode_config["temperature"]
            if max_tokens is None:
                max_tokens = mode_config["max_new_tokens"]

            logger.info(f"[✺] Generating - temp: {temperature}, max_tokens: {max_tokens}")

            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by wall-clock adjustments.
            start_time = time.perf_counter()

            # Only these five parameters are overridden per call; the other
            # config keys (repetition_penalty, do_sample) reach the generator
            # solely through the sampling_config given at construction.
            # NOTE(review): this call is synchronous inside a coroutine; if
            # TextGenerator.generate blocks, the event loop is blocked for the
            # whole generation — confirm whether that is intended.
            raw_text = self.generator.generate(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=mode_config["top_p"],
                top_k=mode_config["top_k"],
                min_p=mode_config["min_p"],
            )

            elapsed = time.perf_counter() - start_time

            if raw_text:
                logger.info(f"[✺] Generated {len(raw_text)} chars in {elapsed:.2f}s")
                return raw_text.strip()

            logger.warning("[✺] Empty response")
            return ""

        except Exception as e:
            # Deliberate top-level boundary: callers always get a string back.
            logger.error(f"[✺] Generation failed: {e}")
            return "I encountered an error while thinking."