Files
vi/services/oracle/llm/generator.py
Alex Kazaiev ee1cb5540a Add Oracle service - LLM wrapper
First service for Vi's nervous system:
- Oracle service with NATS integration
- vLLM backend for Qwen3-32B
- GPTQ quantization support
- Thinking mode sampling configs

Simplified from Lyra's patterns, ready to test.

🦊
2026-01-02 13:19:15 -06:00

70 lines
2.1 KiB
Python

"""
Text generation using vLLM.
"""
from typing import Optional
from core.logger import setup_logger
# Module-level logger tagged with the owning service so oracle_service
# log lines can be filtered in aggregated output.
logger = setup_logger('text_generator', service_name='oracle_service')
class TextGenerator:
    """Text generation with vLLM.

    Thin wrapper around a pre-initialized vLLM model plus a dict of
    default sampling parameters; per-call keyword overrides are layered
    on top of the configured defaults for each generation.
    """

    def __init__(self, llm_model, sampling_config: dict):
        """
        Args:
            llm_model: Initialized vLLM model object (or a falsy value
                when the backend is unavailable — `generate` then no-ops).
            sampling_config: Default sampling parameters. Must contain
                "temperature", "top_p", and "max_new_tokens"; may contain
                "top_k", "min_p", and "repetition_penalty".
        """
        self.llm = llm_model
        self.sampling_config = sampling_config

    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        top_k: Optional[int] = None,
        min_p: Optional[float] = None
    ) -> Optional[str]:
        """Generate text using vLLM.

        Args:
            prompt: Input prompt passed verbatim to the model.
            max_tokens: Override for the configured "max_new_tokens".
            temperature: Override for the configured "temperature".
            top_p: Override for the configured "top_p".
            top_k: Override for the configured "top_k".
            min_p: Override for the configured "min_p".

        Returns:
            The stripped generated text, "" when the model produced no
            output, or None when the LLM is uninitialized or any error
            occurred (the error is logged, not raised).
        """
        if not self.llm:
            logger.error("[✺] LLM not initialized")
            return None
        try:
            # Start from the configured defaults; apply per-call overrides.
            params = self.sampling_config.copy()
            if max_tokens is not None:
                params["max_new_tokens"] = max_tokens
            if temperature is not None:
                params["temperature"] = temperature
            if top_p is not None:
                params["top_p"] = top_p
            if top_k is not None:
                params["top_k"] = top_k
            if min_p is not None:
                params["min_p"] = min_p

            # Imported lazily so merely importing this module does not
            # require vLLM to be installed.
            from vllm import SamplingParams
            sampling_params = SamplingParams(
                temperature=params["temperature"],
                top_p=params["top_p"],
                top_k=params.get("top_k", -1),      # -1 disables top-k in vLLM
                min_p=params.get("min_p", 0.0),     # 0.0 disables min-p
                max_tokens=params["max_new_tokens"],
                # Fix: was a hard params["repetition_penalty"] lookup, which
                # raised KeyError (swallowed below as a failed generation)
                # whenever the config omitted the key. 1.0 = no penalty.
                repetition_penalty=params.get("repetition_penalty", 1.0)
            )
            outputs = self.llm.generate([prompt], sampling_params)
            if outputs and outputs[0].outputs:
                raw_text = outputs[0].outputs[0].text
                logger.info(f"[✺] Generated {len(raw_text)} chars")
                return raw_text.strip()
            else:
                logger.warning("[✺] Empty output")
                return ""
        except Exception as e:
            # Boundary handler: generation failures are reported to the
            # caller as None rather than propagated.
            logger.error(f"[✺] Generation failed: {e}")
            return None