Add Oracle service - LLM wrapper

First service for Vi's nervous system: Oracle service with NATS integration; vLLM backend for Qwen3-32B; GPTQ quantization support; thinking-mode sampling configs. Simplified from Lyra's patterns, ready to test. 🦊✺
2026-01-02 13:19:15 -06:00
parent e2d24a66f1
commit ee1cb5540a
8 changed files with 552 additions and 0 deletions
--- a/services/oracle/llm/generator.py
+++ b/services/oracle/llm/generator.py
@@ -0,0 +1,69 @@
+"""
+Text generation using vLLM.
+"""
+
+from typing import Optional
+from core.logger import setup_logger
+
+logger = setup_logger('text_generator', service_name='oracle_service')
+
+
+class TextGenerator:
+    """Generate text from a vLLM model with configurable sampling defaults."""
+
+    def __init__(self, llm_model, sampling_config: dict):
+        """Keep the model handle and the default sampling settings."""
+        self.llm = llm_model  # a falsy value is treated as "not initialized"
+        self.sampling_config = sampling_config  # per-call arguments override these
+
+    def generate(
+        self,
+        prompt: str,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        top_k: Optional[int] = None,
+        min_p: Optional[float] = None,
+    ) -> Optional[str]:
+        """Generate a completion for ``prompt``; ``None`` arguments use the config.
+
+        Returns the stripped text, ``""`` if the model produced no output, or
+        ``None`` if the model is unset, required config keys are missing, or
+        generation raised (failures are logged, never propagated).
+        """
+        if not self.llm:
+            logger.error("[✺] LLM not initialized")
+            return None
+
+        # The config key is "max_new_tokens"; it is passed to vLLM as max_tokens.
+        overrides = {"max_new_tokens": max_tokens, "temperature": temperature,
+                     "top_p": top_p, "top_k": top_k, "min_p": min_p}
+        required = ("temperature", "top_p", "max_new_tokens", "repetition_penalty")
+        try:
+            params = self.sampling_config.copy()  # never mutate the shared defaults
+            params.update({k: v for k, v in overrides.items() if v is not None})
+            missing = [key for key in required if key not in params]
+            if missing:  # name the keys: a bare KeyError logs only e.g. 'top_p'
+                logger.error(f"[✺] Generation failed: missing config keys {missing}")
+                return None
+
+            from vllm import SamplingParams  # lazy import; ImportError handled below
+
+            sampling_params = SamplingParams(
+                temperature=params["temperature"],
+                top_p=params["top_p"],
+                top_k=params.get("top_k", -1),
+                min_p=params.get("min_p", 0.0),
+                max_tokens=params["max_new_tokens"],
+                repetition_penalty=params["repetition_penalty"],
+            )
+            outputs = self.llm.generate([prompt], sampling_params)
+            if outputs and outputs[0].outputs:
+                raw_text = outputs[0].outputs[0].text
+                logger.info(f"[✺] Generated {len(raw_text)} chars")
+                return raw_text.strip()
+            logger.warning("[✺] Empty output")
+            return ""
+        except Exception as e:  # service boundary: log with traceback, return None
+            logger.error(f"[✺] Generation failed: {e}", exc_info=True)
+            return None